Using tsne to visualize the similarity of different sentences
2022-06-30 00:49:00 【This Livermore isn't too cold】
Purpose of t-SNE: dimensionality reduction and visualization of high-dimensional data.

t-SNE maps each data point to a probability distribution. In the high-dimensional space it uses a Gaussian distribution to convert pairwise distances into probabilities; in the low-dimensional space it uses a long-tailed distribution (Student's t) instead. Because of the heavy tail, small and moderate distances in the high-dimensional space map to larger distances after embedding, which relieves crowding and lets the method emphasize local structure without entirely discarding global structure.
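As a minimal, self-contained sketch of this mapping (random vectors standing in for sentence embeddings; the shapes and parameters here are illustrative, not from the original script), t-SNE takes an (n, d) matrix and returns an (n, 2) one:

```python
import numpy as np
from sklearn.manifold import TSNE

rng = np.random.RandomState(0)
points = rng.rand(50, 128)  # 50 toy "sentence embeddings" in 128-D

# perplexity must be smaller than the number of samples
embedded = TSNE(n_components=2, perplexity=10, init='pca',
                random_state=0).fit_transform(points)
print(embedded.shape)  # (50, 2)
```

The two output columns are the x/y coordinates that the plotting code below scatters and annotates.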
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
# import nltk
# nltk.download('punkt')  # needed once before word_tokenize will work
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument
from sentence_transformers import SentenceTransformer
from cope_dataset import get_unrepeat_txt


def cosine(u, v):
    """Cosine similarity between two vectors."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
def tsne_plot(tokens, labels):
    """Reduce the embeddings to 2-D with t-SNE and plot each point with its label."""
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]

    plt.figure(figsize=(32, 32))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig('embedding_map.png')  # save before show(): show() clears the figure
    plt.show()
def kmeans_vis(sentence_embeddings):
    """Cluster the embeddings with k-means and plot the first two dimensions."""
    plt.figure(figsize=(32, 32))
    clf = KMeans(n_clusters=1000)
    y_pred = clf.fit_predict(sentence_embeddings)  # fit once and reuse the fitted model
    plt.scatter(sentence_embeddings[:, 0], sentence_embeddings[:, 1], c=y_pred)
    plt.title("Anisotropically Distributed Blobs")
    plt.savefig('kmeans.jpg')
    plt.show()
    # Cluster assignment for every sentence vector
    labels = clf.labels_
    print(clf.cluster_centers_)
def dbscan_vis(sentence_embeddings):
    """Cluster the embeddings with DBSCAN and plot the first two dimensions."""
    plt.figure(figsize=(16, 16))
    y_pred = DBSCAN(eps=0.5, min_samples=100).fit_predict(sentence_embeddings)
    plt.scatter(sentence_embeddings[:, 0], sentence_embeddings[:, 1], c=y_pred)
    plt.savefig('dbscan.jpg')
    plt.show()


def train(str_en_text):
    tokenized_sent = [word_tokenize(s.lower()) for s in str_en_text]
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]

    sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentence_embeddings = sbert_model.encode(str_en_text)  # one vector per sentence

    # kmeans_vis(sentence_embeddings)
    dbscan_vis(sentence_embeddings)
    # tsne_plot(sentence_embeddings[:88], str_en_text[:88])

    # Compare the similarity between a query and every sentence
    # query = "Will the cat tower topple over easily?My cat is about 7-8 pounds."
    # query_vec = sbert_model.encode([query])[0]
    # for sent in str_en_text:
    #     sim = cosine(query_vec, sbert_model.encode([sent])[0])
    #     # print("Sentence = ", sent, "; similarity = ", sim)
    #     if float(sim) > 0.8:
    #         break
if __name__ == '__main__':
    json_path = '/cloud/cloud_disk/users/huh/dataset/nlp_dataset/question_dataset/ori/en_ch_cattree_personality.json'
    str_en_text = get_unrepeat_txt()
    train(str_en_text)
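The commented-out query block inside `train` relies on the `cosine` helper defined earlier. As a standalone sketch of what it computes (toy vectors instead of SBERT embeddings, so no model download is needed):

```python
import numpy as np

def cosine(u, v):
    # Cosine similarity: dot product normalized by the vector magnitudes.
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

query_vec = np.array([1.0, 2.0, 3.0])
candidates = [np.array([1.0, 2.0, 3.0]),    # same direction
              np.array([-1.0, -2.0, -3.0]), # opposite direction
              np.array([3.0, -1.5, 0.0])]   # orthogonal

sims = [cosine(query_vec, c) for c in candidates]
print([round(s, 2) for s in sims])  # [1.0, -1.0, 0.0]
```

With real SBERT vectors, values near 1.0 mean near-duplicate sentences, which is why the script breaks out of the loop once the similarity exceeds 0.8.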