Clip usage
2022-06-10 11:39:00 【MallocLu】
Environment setup
# 1. Install PyTorch (use the command for your platform from pytorch.org)
# 2. Install the dependencies (ftfy, regex, tqdm)
pip install ftfy regex tqdm
# 3. Install CLIP
pip install git+https://github.com/openai/CLIP.git
# If GitHub is not directly reachable from your network, a mirror can be used:
# pip install git+https://github.91chi.fun/https://github.com/openai/CLIP.git
API
# 1. List the available models
clip.available_models()
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
# 2. Load a model together with its image preprocessing transform
model, preprocess = clip.load("ViT-B/32")
# 3. preprocess converts a PIL Image into a tensor of shape [3, 224, 224]; unsqueeze(0) then adds the batch dimension, giving [1, 3, 224, 224], which is what the model expects
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)
# 4. Convert a list of sentences into a tensor of token ids with shape [batch_size, context_length]
# Each sentence is wrapped with BOS (49406) and EOS (49407) tokens, then padded to context_length (default 77)
# (If a sentence is longer than context_length - 2 tokens, you must pass truncate=True; the result is then truncated but still ends with EOS, i.e., the EOS token is never cut off)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) # [3, 77]
# 5. Encode a batch of images into image features
image_features = model.encode_image(image)
# 6. Encode a batch of texts into text features
text_features = model.encode_text(text)
# 7. Compute the similarity logits between every image and every text (the cosine similarity of the normalized features, scaled by the learned logit_scale)
logits_per_image, logits_per_text = model(image, text)
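The points above can be checked with a short, self-contained sketch. It assumes the ViT-B/32 checkpoint can be downloaded and that a CLIP.png exists in the working directory; it verifies the tokenize shapes (including truncate=True) and recomputes the logits from the normalized features and logit_scale.
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# tokenize always yields [batch_size, 77]; over-long sentences need truncate=True
tokens = clip.tokenize(["a diagram", "a dog", "a cat"])
long_tokens = clip.tokenize(["a " * 100 + "dog"], truncate=True)
print(tokens.shape, long_tokens.shape)  # torch.Size([3, 77]) torch.Size([1, 77])

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)  # [1, 3, 224, 224]
text = tokens.to(device)

with torch.no_grad():
    image_features = model.encode_image(image)  # [1, 512]
    text_features = model.encode_text(text)     # [3, 512]

    # model(image, text) is logit_scale.exp() times the cosine similarity of normalized features
    image_norm = image_features / image_features.norm(dim=-1, keepdim=True)
    text_norm = text_features / text_features.norm(dim=-1, keepdim=True)
    manual_logits = model.logit_scale.exp() * image_norm @ text_norm.t()

    logits_per_image, logits_per_text = model(image, text)
    print(torch.allclose(manual_logits, logits_per_image))  # expected: True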
Shape & dtype for each model
ViT-B/32
# feature dim 512, input resolution (224, 224)
image torch.Size([B, 3, 224, 224]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 512]) torch.float16
text_features torch.Size([B, 512]) torch.float16
ViT-B/16
# feature dim 512, input resolution (224, 224)
image torch.Size([B, 3, 224, 224]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 512]) torch.float16
text_features torch.Size([B, 512]) torch.float16
ViT-L/14
# feature dim 768, input resolution (224, 224)
image torch.Size([B, 3, 224, 224]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 768]) torch.float16
text_features torch.Size([B, 768]) torch.float16
ViT-L/14@336px
# feature dim 768, input resolution (336, 336)
image torch.Size([B, 3, 336, 336]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 768]) torch.float16
text_features torch.Size([B, 768]) torch.float16
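The shapes and dtypes listed above can be reproduced with a short check; the sketch below assumes the checkpoints can be downloaded and a CLIP.png is available, and simply prints the tensors for each model.
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

for name in ["ViT-B/32", "ViT-L/14"]:  # add "ViT-L/14@336px" for the 336x336 variant
    model, preprocess = clip.load(name, device=device)
    image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
    text = clip.tokenize(["a diagram"]).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    # note: on CPU the weights stay float32, so the float16 dtypes in the table assume a GPU
    print(name, image.shape, image.dtype, text.shape, text.dtype)
    print(name, image_features.shape, image_features.dtype, text_features.shape, text_features.dtype)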
preprocess & tokenize for each model
# No matter which model is loaded, clip.tokenize returns the same result
# preprocess differs only in the output image size:
# ViT-B/32, ViT-B/16, and ViT-L/14 share the same preprocess (224x224), while ViT-L/14@336px resizes to 336x336
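A quick way to see this difference is the sketch below; note that it downloads both checkpoints just to compare their preprocessing.
import clip
from PIL import Image

img = Image.open("CLIP.png")

_, preprocess_b32 = clip.load("ViT-B/32", device="cpu")
_, preprocess_336 = clip.load("ViT-L/14@336px", device="cpu")

# tokenize does not depend on which model was loaded
print(clip.tokenize(["a dog"]).shape)   # torch.Size([1, 77])

# only the target resolution of preprocess differs
print(preprocess_b32(img).shape)        # torch.Size([3, 224, 224])
print(preprocess_336(img).shape)        # torch.Size([3, 336, 336])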
Basic usage
Example 1
import torch
import clip
from PIL import Image
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]
Example 2
import os
import clip
import torch
from torchvision.datasets import CIFAR100
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)
# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)
# Prepare the inputs
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {
c}") for c in cifar100.classes]).to(device)
# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)
# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
# The factor of 100.0 is not strictly necessary; it just scales the cosine similarities so the softmax scores read like percentages
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)
# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")
Example 3
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)
# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)
def get_features(dataset):
    all_features = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)
# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)
# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.  # np.float is removed in recent NumPy; use float
print(f"Accuracy = {accuracy:.3f}")
(Important) Freezing or updating the CLIP parameters
About detach
# Because our model only uses CLIP's visual encoder, we only check whether the visual encoder's (and the linear head's) parameters changed
# (torch.equal prints True if a parameter is unchanged, False if it was updated)
# With neither Position 1 nor Position 2 enabled, everything prints False, i.e., all parameters are updated
# With only Position 1 enabled, the CLIP parameters print True and the Linear parameters print False, i.e., only the Linear layer is updated
# With only Position 2 enabled, the CLIP parameters print False and the Linear parameters print True, i.e., only the CLIP parameters are updated
# (A sketch of the more common requires_grad-based freezing follows after this example)
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        self.linear = nn.Linear(512, 10)
        # Position 2
        # for param in self.linear.parameters():
        #     param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)
        # Position 1
        # features = features.detach()
        return self.linear(features)
net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)
root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))
storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()
for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())
for name, param in net.model.visual.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
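Besides detach, the usual way to keep the CLIP backbone frozen is to disable requires_grad on its parameters and hand the optimizer only the trainable ones; a minimal sketch reusing the Net class above:
# Freeze the whole CLIP model (both encoders) and train only the linear head
net = Net()
for param in net.model.parameters():
    param.requires_grad = False

# Pass the optimizer only the parameters that still require gradients
optimizer = torch.optim.SGD(
    (p for p in net.parameters() if p.requires_grad), lr=1e-2
)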
CLIP layer structure
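The listings below are simply the repr of each loaded model; they can be regenerated with something like:
import clip

model, _ = clip.load("ViT-B/32", device="cpu")
print(model)         # full CLIP module (visual + text towers)
print(model.visual)  # image branch only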
ViT-B/32
CLIP(
# Image branch
(visual): VisionTransformer(
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
# Text branch
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
)
)
(token_embedding): Embedding(49408, 512)
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
ViT-B/16
CLIP(
# Image branch
(visual): VisionTransformer(
(conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
# Text branch
(token_embedding): Embedding(49408, 512)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
ViT-L/14
CLIP(
# Image branch
(visual): VisionTransformer(
(conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
(ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(12): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(13): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(14): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(15): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(16): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(17): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(18): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(19): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(20): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(21): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(22): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(23): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
# Text branch
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(token_embedding): Embedding(49408, 768)
(ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
ViT-L/14@336px
CLIP(
# Image branch
(visual): VisionTransformer(
(conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
(ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(12): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(13): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(14): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(15): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(16): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(17): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(18): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(19): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(20): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(21): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(22): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(23): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
# Text-related
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(token_embedding): Embedding(49408, 768)
(ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
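The module dump above and the parameter-name lists below can be reproduced directly from a loaded model. A minimal sketch (any name from clip.available_models() works):
import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14", device=device)

# Nested module structure (the ResidualAttentionBlock dump above)
print(model)

# Flat parameter names (the lists in the next section)
for name, param in model.named_parameters():
    print(name, tuple(param.shape))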
CLIP parameter structure
ViT-B
Covers ViT-B/32 and ViT-B/16
# logit_scale: the learned temperature applied to the cosine-similarity logits.
# In CLIP's forward pass it is used as follows (a runnable check of this appears after the parameter lists):
# normalized features
# image_features = image_features / image_features.norm(dim=1, keepdim=True)
# text_features = text_features / text_features.norm(dim=1, keepdim=True)
# # cosine similarity as logits
# logit_scale = self.logit_scale.exp()
# logits_per_image = logit_scale * image_features @ text_features.t()
# logits_per_text = logits_per_image.t()
logit_scale
# Image-related
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj
# Text-related
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection
ViT-L
Covers ViT-L/14 and ViT-L/14@336px
logit_scale
# Image-related
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.transformer.resblocks.12.attn.in_proj_weight
visual.transformer.resblocks.12.attn.in_proj_bias
visual.transformer.resblocks.12.attn.out_proj.weight
visual.transformer.resblocks.12.attn.out_proj.bias
visual.transformer.resblocks.12.ln_1.weight
visual.transformer.resblocks.12.ln_1.bias
visual.transformer.resblocks.12.mlp.c_fc.weight
visual.transformer.resblocks.12.mlp.c_fc.bias
visual.transformer.resblocks.12.mlp.c_proj.weight
visual.transformer.resblocks.12.mlp.c_proj.bias
visual.transformer.resblocks.12.ln_2.weight
visual.transformer.resblocks.12.ln_2.bias
visual.transformer.resblocks.13.attn.in_proj_weight
visual.transformer.resblocks.13.attn.in_proj_bias
visual.transformer.resblocks.13.attn.out_proj.weight
visual.transformer.resblocks.13.attn.out_proj.bias
visual.transformer.resblocks.13.ln_1.weight
visual.transformer.resblocks.13.ln_1.bias
visual.transformer.resblocks.13.mlp.c_fc.weight
visual.transformer.resblocks.13.mlp.c_fc.bias
visual.transformer.resblocks.13.mlp.c_proj.weight
visual.transformer.resblocks.13.mlp.c_proj.bias
visual.transformer.resblocks.13.ln_2.weight
visual.transformer.resblocks.13.ln_2.bias
visual.transformer.resblocks.14.attn.in_proj_weight
visual.transformer.resblocks.14.attn.in_proj_bias
visual.transformer.resblocks.14.attn.out_proj.weight
visual.transformer.resblocks.14.attn.out_proj.bias
visual.transformer.resblocks.14.ln_1.weight
visual.transformer.resblocks.14.ln_1.bias
visual.transformer.resblocks.14.mlp.c_fc.weight
visual.transformer.resblocks.14.mlp.c_fc.bias
visual.transformer.resblocks.14.mlp.c_proj.weight
visual.transformer.resblocks.14.mlp.c_proj.bias
visual.transformer.resblocks.14.ln_2.weight
visual.transformer.resblocks.14.ln_2.bias
visual.transformer.resblocks.15.attn.in_proj_weight
visual.transformer.resblocks.15.attn.in_proj_bias
visual.transformer.resblocks.15.attn.out_proj.weight
visual.transformer.resblocks.15.attn.out_proj.bias
visual.transformer.resblocks.15.ln_1.weight
visual.transformer.resblocks.15.ln_1.bias
visual.transformer.resblocks.15.mlp.c_fc.weight
visual.transformer.resblocks.15.mlp.c_fc.bias
visual.transformer.resblocks.15.mlp.c_proj.weight
visual.transformer.resblocks.15.mlp.c_proj.bias
visual.transformer.resblocks.15.ln_2.weight
visual.transformer.resblocks.15.ln_2.bias
visual.transformer.resblocks.16.attn.in_proj_weight
visual.transformer.resblocks.16.attn.in_proj_bias
visual.transformer.resblocks.16.attn.out_proj.weight
visual.transformer.resblocks.16.attn.out_proj.bias
visual.transformer.resblocks.16.ln_1.weight
visual.transformer.resblocks.16.ln_1.bias
visual.transformer.resblocks.16.mlp.c_fc.weight
visual.transformer.resblocks.16.mlp.c_fc.bias
visual.transformer.resblocks.16.mlp.c_proj.weight
visual.transformer.resblocks.16.mlp.c_proj.bias
visual.transformer.resblocks.16.ln_2.weight
visual.transformer.resblocks.16.ln_2.bias
visual.transformer.resblocks.17.attn.in_proj_weight
visual.transformer.resblocks.17.attn.in_proj_bias
visual.transformer.resblocks.17.attn.out_proj.weight
visual.transformer.resblocks.17.attn.out_proj.bias
visual.transformer.resblocks.17.ln_1.weight
visual.transformer.resblocks.17.ln_1.bias
visual.transformer.resblocks.17.mlp.c_fc.weight
visual.transformer.resblocks.17.mlp.c_fc.bias
visual.transformer.resblocks.17.mlp.c_proj.weight
visual.transformer.resblocks.17.mlp.c_proj.bias
visual.transformer.resblocks.17.ln_2.weight
visual.transformer.resblocks.17.ln_2.bias
visual.transformer.resblocks.18.attn.in_proj_weight
visual.transformer.resblocks.18.attn.in_proj_bias
visual.transformer.resblocks.18.attn.out_proj.weight
visual.transformer.resblocks.18.attn.out_proj.bias
visual.transformer.resblocks.18.ln_1.weight
visual.transformer.resblocks.18.ln_1.bias
visual.transformer.resblocks.18.mlp.c_fc.weight
visual.transformer.resblocks.18.mlp.c_fc.bias
visual.transformer.resblocks.18.mlp.c_proj.weight
visual.transformer.resblocks.18.mlp.c_proj.bias
visual.transformer.resblocks.18.ln_2.weight
visual.transformer.resblocks.18.ln_2.bias
visual.transformer.resblocks.19.attn.in_proj_weight
visual.transformer.resblocks.19.attn.in_proj_bias
visual.transformer.resblocks.19.attn.out_proj.weight
visual.transformer.resblocks.19.attn.out_proj.bias
visual.transformer.resblocks.19.ln_1.weight
visual.transformer.resblocks.19.ln_1.bias
visual.transformer.resblocks.19.mlp.c_fc.weight
visual.transformer.resblocks.19.mlp.c_fc.bias
visual.transformer.resblocks.19.mlp.c_proj.weight
visual.transformer.resblocks.19.mlp.c_proj.bias
visual.transformer.resblocks.19.ln_2.weight
visual.transformer.resblocks.19.ln_2.bias
visual.transformer.resblocks.20.attn.in_proj_weight
visual.transformer.resblocks.20.attn.in_proj_bias
visual.transformer.resblocks.20.attn.out_proj.weight
visual.transformer.resblocks.20.attn.out_proj.bias
visual.transformer.resblocks.20.ln_1.weight
visual.transformer.resblocks.20.ln_1.bias
visual.transformer.resblocks.20.mlp.c_fc.weight
visual.transformer.resblocks.20.mlp.c_fc.bias
visual.transformer.resblocks.20.mlp.c_proj.weight
visual.transformer.resblocks.20.mlp.c_proj.bias
visual.transformer.resblocks.20.ln_2.weight
visual.transformer.resblocks.20.ln_2.bias
visual.transformer.resblocks.21.attn.in_proj_weight
visual.transformer.resblocks.21.attn.in_proj_bias
visual.transformer.resblocks.21.attn.out_proj.weight
visual.transformer.resblocks.21.attn.out_proj.bias
visual.transformer.resblocks.21.ln_1.weight
visual.transformer.resblocks.21.ln_1.bias
visual.transformer.resblocks.21.mlp.c_fc.weight
visual.transformer.resblocks.21.mlp.c_fc.bias
visual.transformer.resblocks.21.mlp.c_proj.weight
visual.transformer.resblocks.21.mlp.c_proj.bias
visual.transformer.resblocks.21.ln_2.weight
visual.transformer.resblocks.21.ln_2.bias
visual.transformer.resblocks.22.attn.in_proj_weight
visual.transformer.resblocks.22.attn.in_proj_bias
visual.transformer.resblocks.22.attn.out_proj.weight
visual.transformer.resblocks.22.attn.out_proj.bias
visual.transformer.resblocks.22.ln_1.weight
visual.transformer.resblocks.22.ln_1.bias
visual.transformer.resblocks.22.mlp.c_fc.weight
visual.transformer.resblocks.22.mlp.c_fc.bias
visual.transformer.resblocks.22.mlp.c_proj.weight
visual.transformer.resblocks.22.mlp.c_proj.bias
visual.transformer.resblocks.22.ln_2.weight
visual.transformer.resblocks.22.ln_2.bias
visual.transformer.resblocks.23.attn.in_proj_weight
visual.transformer.resblocks.23.attn.in_proj_bias
visual.transformer.resblocks.23.attn.out_proj.weight
visual.transformer.resblocks.23.attn.out_proj.bias
visual.transformer.resblocks.23.ln_1.weight
visual.transformer.resblocks.23.ln_1.bias
visual.transformer.resblocks.23.mlp.c_fc.weight
visual.transformer.resblocks.23.mlp.c_fc.bias
visual.transformer.resblocks.23.mlp.c_proj.weight
visual.transformer.resblocks.23.mlp.c_proj.bias
visual.transformer.resblocks.23.ln_2.weight
visual.transformer.resblocks.23.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj
# Text-related
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection
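As a check on the logit_scale comment above, the logits returned by model(image, text) can be recomputed from the encoded features. A small sketch, assuming model, image and text from the earlier examples:
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # normalize, then scale the cosine similarity by exp(logit_scale)
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    logit_scale = model.logit_scale.exp()
    logits_per_image = logit_scale * image_features @ text_features.t()
    logits_per_text = logits_per_image.t()
    # should match the first return value of model(image, text)
    ref_image, ref_text = model(image, text)
    print(torch.allclose(logits_per_image, ref_image))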
Fine-tuning only the last layers of CLIP
def get_optim_params(model_name: str):
    # Names of the parameters to keep trainable: the last transformer block of each
    # encoder plus the final LayerNorms and projection matrices.
    if model_name in ['ViT-B/32', 'ViT-B/16']:
        return ['visual.transformer.resblocks.11.attn.in_proj_weight',
                'visual.transformer.resblocks.11.attn.in_proj_bias',
                'visual.transformer.resblocks.11.attn.out_proj.weight',
                'visual.transformer.resblocks.11.attn.out_proj.bias',
                'visual.transformer.resblocks.11.ln_1.weight',
                'visual.transformer.resblocks.11.ln_1.bias',
                'visual.transformer.resblocks.11.mlp.c_fc.weight',
                'visual.transformer.resblocks.11.mlp.c_fc.bias',
                'visual.transformer.resblocks.11.mlp.c_proj.weight',
                'visual.transformer.resblocks.11.mlp.c_proj.bias',
                'visual.transformer.resblocks.11.ln_2.weight',
                'visual.transformer.resblocks.11.ln_2.bias',
                'visual.ln_post.weight',
                'visual.ln_post.bias',
                'visual.proj',
                'transformer.resblocks.11.attn.in_proj_weight',
                'transformer.resblocks.11.attn.in_proj_bias',
                'transformer.resblocks.11.attn.out_proj.weight',
                'transformer.resblocks.11.attn.out_proj.bias',
                'transformer.resblocks.11.ln_1.weight',
                'transformer.resblocks.11.ln_1.bias',
                'transformer.resblocks.11.mlp.c_fc.weight',
                'transformer.resblocks.11.mlp.c_fc.bias',
                'transformer.resblocks.11.mlp.c_proj.weight',
                'transformer.resblocks.11.mlp.c_proj.bias',
                'transformer.resblocks.11.ln_2.weight',
                'transformer.resblocks.11.ln_2.bias',
                'ln_final.weight',
                'ln_final.bias',
                'text_projection']
    elif model_name in ['ViT-L/14', 'ViT-L/14@336px']:
        return ['visual.transformer.resblocks.23.attn.in_proj_weight',
                'visual.transformer.resblocks.23.attn.in_proj_bias',
                'visual.transformer.resblocks.23.attn.out_proj.weight',
                'visual.transformer.resblocks.23.attn.out_proj.bias',
                'visual.transformer.resblocks.23.ln_1.weight',
                'visual.transformer.resblocks.23.ln_1.bias',
                'visual.transformer.resblocks.23.mlp.c_fc.weight',
                'visual.transformer.resblocks.23.mlp.c_fc.bias',
                'visual.transformer.resblocks.23.mlp.c_proj.weight',
                'visual.transformer.resblocks.23.mlp.c_proj.bias',
                'visual.transformer.resblocks.23.ln_2.weight',
                'visual.transformer.resblocks.23.ln_2.bias',
                'visual.ln_post.weight',
                'visual.ln_post.bias',
                'visual.proj',
                'transformer.resblocks.11.attn.in_proj_weight',
                'transformer.resblocks.11.attn.in_proj_bias',
                'transformer.resblocks.11.attn.out_proj.weight',
                'transformer.resblocks.11.attn.out_proj.bias',
                'transformer.resblocks.11.ln_1.weight',
                'transformer.resblocks.11.ln_1.bias',
                'transformer.resblocks.11.mlp.c_fc.weight',
                'transformer.resblocks.11.mlp.c_fc.bias',
                'transformer.resblocks.11.mlp.c_proj.weight',
                'transformer.resblocks.11.mlp.c_proj.bias',
                'transformer.resblocks.11.ln_2.weight',
                'transformer.resblocks.11.ln_2.bias',
                'ln_final.weight',
                'ln_final.bias',
                'text_projection']
    else:
        raise ValueError(f"no optim params defined for model {model_name}")
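The example below applies this list to a ViT-B/32 model: everything else is frozen, one CIFAR10 batch is built with the model's own preprocess, and a few optimization steps are run (MSE between image and text features is only a placeholder loss to show that gradients flow through the unfrozen layers).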
import os
import clip
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        # Freeze everything except the parameters returned by get_optim_params
        optim_params = get_optim_params('ViT-B/32')
        for name, param in self.model.named_parameters():
            if name not in optim_params:
                param.requires_grad = False

    def forward(self, image, text):
        image_features = self.model.encode_image(image)
        text_features = self.model.encode_text(text)
        return image_features, text_features


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

# One batch of CIFAR10 images with matching "a photo of a ..." prompts
root = os.path.expanduser("~/.cache")
cifar10 = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(cifar10, batch_size=8)))
images = train[0]
texts = torch.cat([clip.tokenize(f"a photo of a {cifar10.classes[c]}") for c in train[1]])

# Snapshot every parameter so we can check later which ones actually changed
storeParam = {}
for name, param in net.named_parameters():
    storeParam[name] = param.detach().clone()

for i in range(10):
    image_features, text_features = net(images, texts)
    loss = F.mse_loss(image_features, text_features)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())

# Print the names of the parameters that were updated
for name, param in net.named_parameters():
    if not torch.equal(param, storeParam[name]):
        print(name)
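The final loop should only print names that appear in get_optim_params, confirming that the rest of CLIP stayed frozen during training.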