当前位置:网站首页>[NLP] generate word cloud
[NLP] generate word cloud
2022-07-28 22:04:00 【Du Hengzhi】
from imageio import imread
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
import jieba # word-segmentation package
import numpy # numerical computing package
import codecs # codecs.open lets the caller name a file's encoding; the content is decoded to unicode on read
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
# First default figure size (width, height); it is overridden a few lines below.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud # word cloud package
# Default figure size for the word cloud; this replaces the (10.0, 5.0) value set above.
matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
from wordcloud import WordCloud,ImageColorGenerator
def createSuperWordCloud(text_path, image_path,
                         stopwords_path="data/stopwords.txt",
                         font_path="data/simhei.ttf"):
    """Draw a word cloud shaped and colored by an image from a CSV of texts.

    The CSV at ``text_path`` must have a ``content`` column. Each value is
    tokenized with ``jieba.lcut``; tokens of length <= 1, the token
    ``'\\r\\n'`` and any token listed in the stop-word file are discarded.
    The 1000 most frequent remaining tokens are laid out with ``WordCloud``
    using the image at ``image_path`` as mask, recolored with
    ``ImageColorGenerator`` built from the same image, and drawn on the
    current matplotlib axes with the axis decorations turned off.

    Args:
        text_path: Path of the UTF-8 CSV file to read (e.g.
            "./data/entertainment_news.csv").
        image_path: Path of the image used both as mask and color source.
        stopwords_path: Path of a UTF-8 stop-word list, one word per line.
            Defaults to the path the function originally hard-coded.
        font_path: Font file handed to ``WordCloud``. Defaults to the path
            the function originally hard-coded.

    Returns:
        None. The result is drawn with ``plt.imshow``.
    """
    df = pd.read_csv(text_path, encoding='utf-8')
    # Drop rows that contain missing values.
    df = df.dropna()
    content = df.content.values.tolist()

    segment = []
    for line in content:
        # Only the tokenizer call is guarded: a row that cannot be tokenized
        # (for instance a non-string value) is printed and skipped, as before,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            segs = jieba.lcut(line)
        except (AttributeError, TypeError, ValueError):
            print(line)
            continue
        # Keep tokens longer than one character that are not a bare CRLF.
        segment.extend(seg for seg in segs if len(seg) > 1 and seg != '\r\n')

    words_df = pd.DataFrame({'segment': segment})

    # quoting=3 is csv.QUOTE_NONE: quote characters in the list are literal.
    stopwords = pd.read_csv(stopwords_path, index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word-frequency table, most frequent first. The sort key must match the
    # aggregated column name exactly ('Count', without surrounding spaces).
    words_stat = (
        words_df.groupby('segment')
        .agg(Count=pd.NamedAgg(column='segment', aggfunc='size'))
        .reset_index()
        .sort_values(by='Count', ascending=False)
    )

    # The image serves as the mask for the cloud.
    bimg = imread(image_path)
    wordcloud = WordCloud(background_color="white", mask=bimg,
                          font_path=font_path, max_font_size=200)

    # token -> frequency for the 1000 most frequent tokens.
    word_frequence = {row[0]: row[1] for row in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)

    # Recolor the words using colors taken from the image.
    bimgColors = ImageColorGenerator(bimg)
    plt.axis("off")
    plt.imshow(wordcloud.recolor(color_func=bimgColors))
# Generate the word cloud for the entertainment-news CSV, using the JPEG as the mask/color image.
createSuperWordCloud("./data/entertainment_news.csv",'image/entertainment.jpeg')
Result

边栏推荐
- 基于复杂网络的大群体应急决策专家意见与信任信息融合方法及应用
- How is nanoid faster and more secure than UUID implemented? (glory Collection Edition)
- 管理区解耦架构见过吗?能帮客户搞定大难题的
- An end-to-end aspect level emotion analysis method for government app reviews based on brnn
- How many tips do you know about using mock technology to help improve test efficiency?
- 开放式耳机哪个品牌好、性价比最高的开放式耳机排名
- Matlab | basic knowledge summary I
- 拥抱开源指南
- PCB材料简单介绍
- 第 7 篇:绘制旋转立方体
猜你喜欢

90. Subset II

搞事摸鱼一天有一天

从 Web3到Web2.5,是倒退还是另辟蹊径?

AimBetter洞察您的数据库,DPM 和 APM 解决方案

Open earphone which air conduction earphone with good sound quality and recognized sound quality is recommended

Edited by vimtutor

使用Mock技术帮助提升测试效率的小tips,你知道几个?

KubeVela 1.4.x 官方文档

Kubeedge releases white paper on cloud native edge computing threat model and security protection technology

入行4年,跳槽2次,我摸透了软件测试这一行~
随机推荐
纳米金偶联抗体/蛋白试剂盒(20nm,1mg/100μg/500 μg偶联量)的制备
display 各值的区别
株洲市九方中学开展防溺水、消防安全教育培训活动
kubevela插件addons下载地址
中国科学家首次用DNA构造卷积人工神经网络,可完成32类分子模式识别任务,或用于生物标志物信号分析和诊断
Wechat applet development company, do you know how to choose?
KubeVela 1.4.x 官方文档
小程序 canvas 生成海报
基于多模态融合的非遗图片分类研究
数据插值——对不同量级的数据进行归一化
【云原生之kubernetes】在kubernetes集群下的映射外部服务—Eendpoint
kali里的powersploit、evasion、weevely等工具的杂项记录
使用百度EasyDL实现明厨亮灶厨师帽识别
入行4年,跳槽2次,我摸透了软件测试这一行~
基于知识元的外文专利文献知识描述框架
Technology selection rust post analysis
typeof原理
Vimtutor编辑
Apifox: satisfy all your fantasies about API
如何高效、精准地进行图片搜索?看看轻量化视觉预训练模型