当前位置：网站首页>[NLP] generate word cloud

[NLP] generate word cloud

2022-07-28 22:04:00 【Du Hengzhi】

from imageio import imread
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
import jieba    # word segmentation package
import numpy    # numerical computing package
import codecs   # codecs.open lets the caller name a file's encoding; text is decoded to unicode on read
import pandas as pd  
import matplotlib.pyplot as plt
# IPython/Jupyter magic that renders figures inline; it is not valid syntax in a plain .py script.
%matplotlib inline
import matplotlib
# Default figure size as (width, height).
# NOTE(review): this value is overwritten by the (15.0, 15.0) assignment a few lines below.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud package
# Figure size actually in effect when the word cloud is drawn.
matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)

# WordCloud is imported a second time here, together with ImageColorGenerator.
from wordcloud import WordCloud,ImageColorGenerator
 
def createSuperWordCloud(text_path, image_path,
                         stopwords_path="data/stopwords.txt",
                         font_path='data/simhei.ttf',
                         max_words=1000):
    """Draw an image-shaped, image-coloured word cloud on the current figure.

    The text is read from the ``content`` column of a CSV file, segmented
    with jieba, filtered against a stop-word list, and the most frequent
    words are laid out inside the mask given by ``image_path``.

    Args:
        text_path: Path to a UTF-8 CSV file that has a ``content`` column,
            e.g. "./data/entertainment_news.csv".
        image_path: Path to the image used both as the layout mask and as
            the source of the word colours.
        stopwords_path: Path to a UTF-8 text file with one stop word per
            line.
        font_path: Path to the TrueType font used to render the words.
        max_words: Maximum number of distinct words (most frequent first)
            handed to the word cloud.

    Returns:
        None. The result is drawn with ``matplotlib.pyplot``.
    """
    df = pd.read_csv(text_path, encoding='utf-8')
    # Drop rows with missing values so only usable text reaches jieba.
    df = df.dropna()
    content = df.content.values.tolist()

    segment = []
    for line in content:
        try:
            segs = jieba.lcut(line)
        except Exception:
            # Best effort: show the row that could not be segmented, then
            # carry on with the remaining rows.
            print(line)
            continue
        # Keep tokens longer than one character, excluding the CRLF token.
        segment.extend(seg for seg in segs if len(seg) > 1 and seg != '\r\n')

    # quoting=3 (csv.QUOTE_NONE): quote characters in the list are literal.
    stopwords = pd.read_csv(stopwords_path, index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words = pd.Series(segment, dtype=object)
    words = words[~words.isin(stopwords.stopword)]

    # value_counts() already yields frequencies in descending order.
    word_frequence = words.value_counts().head(max_words).to_dict()

    # The same image acts as the layout mask and as the colour source.
    bimg = imread(image_path)
    wordcloud = WordCloud(background_color="white", mask=bimg,
                          font_path=font_path, max_font_size=200)
    wordcloud = wordcloud.fit_words(word_frequence)
    bimgColors = ImageColorGenerator(bimg)

    plt.imshow(wordcloud.recolor(color_func=bimgColors))
    plt.axis("off")

# Render the word cloud for the entertainment news data set.
createSuperWordCloud(
    text_path="./data/entertainment_news.csv",
    image_path='image/entertainment.jpeg',
)