当前位置:网站首页>Feature Engineering
Feature Engineering
2022-07-05 08:50:00 【Python code doctor】
import jieba
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, StandardScaler

try:
    # Available from scikit-learn 0.20 onwards; replaces Imputer.
    from sklearn.impute import SimpleImputer
except ImportError:
    SimpleImputer = None

try:
    # Removed in scikit-learn 0.22; kept for older installations.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None
def dictvec():
    """Demonstrate feature extraction from a list of dictionaries.

    Fits a ``DictVectorizer`` on three ``{'city', 'temperature'}`` records
    and prints, in order: the transformed data, the feature names and the
    dense two-dimensional array.

    :return: None
    """
    records = [
        {'city': ' Beijing ', 'temperature': 100},
        {'city': ' Shanghai ', 'temperature': 60},
        {'city': ' Shenzhen ', 'temperature': 30},
    ]
    vectorizer = DictVectorizer()
    data = vectorizer.fit_transform(records)
    # The result is a SciPy sparse matrix: it saves memory and is
    # converted to a dense NumPy array below with toarray().
    print(data)
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when present and fall back only on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print(data.toarray())
def countvec():
    """Demonstrate bag-of-words counting on two short text samples.

    Fits a ``CountVectorizer`` and prints, in order: the vocabulary
    (feature names), the sparse count matrix and its dense array form.

    :return: None
    """
    documents = [
        " life Bitterness is short , I use python",
        " life Very long , no need python",
    ]
    cv = CountVectorizer()
    data = cv.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when present and fall back only on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    print(feature_names)
    # Sparse SciPy matrix first, then the dense NumPy array.
    print(data)
    print(data.toarray())
    return None
def cutword():
    """Segment three sample sentences with jieba.

    Each sentence is passed through ``jieba.cut`` and its tokens are
    joined with single spaces so that a whitespace-based vectorizer can
    split them again.

    :return: tuple of three space-joined token strings
    """
    sentences = (
        " It's been a brutal day. , Tomorrow will be more cruel. , The day after tomorrow is beautiful , But most of them will die tomorrow night , So everyone don't give up today .",
        " The light we see coming from distant galaxies was emitted millions of years ago , So when we see the universe , We are looking at its past .",
        " If you only know something in one way , You don't really understand it . The secret of understanding the true meaning of things depends on how we relate them to what we know .",
    )
    joined = []
    for sentence in sentences:
        # jieba.cut yields tokens lazily; join consumes them directly.
        joined.append(' '.join(jieba.cut(sentence)))
    first, second, third = joined
    return first, second, third
def hanzivec():
    """Demonstrate bag-of-words counting on jieba-segmented text.

    Takes the three space-joined token strings from ``cutword()``, fits a
    ``CountVectorizer`` on them and prints, in order: the segmented
    strings, the vocabulary, the sparse count matrix and its dense array.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when present and fall back only on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    print(feature_names)
    # Sparse SciPy matrix first, then the dense NumPy array.
    print(data)
    print(data.toarray())
    return None
def tfidfvec():
    """Demonstrate TF-IDF weighting on jieba-segmented text.

    Takes the three space-joined token strings from ``cutword()``, fits a
    ``TfidfVectorizer`` on them and prints, in order: the segmented
    strings, the sparse TF-IDF matrix, the vocabulary and the dense array.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(data)
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when present and fall back only on older releases.
    if hasattr(tf, 'get_feature_names_out'):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()
    print(feature_names)
    print(data.toarray())
def mm():
    """Demonstrate min-max normalization.

    Fits a ``MinMaxScaler`` configured with ``feature_range=(3, 5)`` on a
    3x4 sample matrix and prints the scaled result.

    :return: None
    """
    samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(3, 5))
    scaled = scaler.fit_transform(samples)
    print(scaled)
    return None
def stand():
    """Demonstrate standardization.

    Fits a ``StandardScaler`` with default settings on a 3x3 sample
    matrix and prints the transformed result.

    :return: None
    """
    samples = [[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]]
    scaler = StandardScaler()
    standardized = scaler.fit_transform(samples)
    print(standardized)
    return None
def im():
    """Demonstrate missing-value handling by mean imputation.

    Replaces the single ``np.nan`` in a 3x2 sample matrix with the mean of
    its column and prints the completed matrix.

    ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22, so
    ``sklearn.impute.SimpleImputer`` is used when it is available. The
    legacy class is only a fallback for old installations.

    :return: None
    """
    samples = [[1, 2], [np.nan, 3], [7, 6]]
    if SimpleImputer is not None:
        # SimpleImputer always works column-wise, matching the old axis=0,
        # and takes the actual np.nan value as the missing-value marker.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    else:
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    data = imputer.fit_transform(samples)
    print(data)
def var():
    """Demonstrate feature selection by removing low-variance features.

    Fits a ``VarianceThreshold`` with ``threshold=0.1`` on a 3x4 sample
    matrix and prints the columns that remain.

    :return: None
    """
    samples = [[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]]
    selector = VarianceThreshold(threshold=0.1)
    kept = selector.fit_transform(samples)
    print(kept)
    return None
def pca():
    """Demonstrate dimensionality reduction with principal component analysis.

    Fits a ``PCA`` with ``n_components=0.95`` on a 3x4 sample matrix and
    prints the projected data.

    :return: None
    """
    # Worth considering once the feature count reaches the hundreds: the
    # data values change and the number of features decreases.
    # n_components may be either
    #   1. an integer: the number of features to reduce to, or
    #   2. a fraction between 0 and 1 (typically 90%-95%).
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    reducer = PCA(n_components=0.95)
    projected = reducer.fit_transform(samples)
    print(projected)
# Run the PCA demo. This call is at module level, so it executes on import
# as well as when the file is run directly.
pca()
边栏推荐
- How apaas is applied in different organizational structures
- Chapter 18 using work queue manager (1)
- 微信H5公众号获取openid爬坑记
- Halcon color_pieces.hdev: classifier-based color recognition
- Wechat H5 official account to get openid climbing account
- golang 基础 —— golang 向 mysql 插入的时间数据和本地时间不一致
- 猜谜语啦(3)
- MPSoC QSPI Flash 升级办法
- Meta标签详解
- Install the CPU version of tensorflow+cuda+cudnn (ultra detailed)
猜你喜欢
猜谜语啦(11)
TF coordinate transformation of common components of ros-9 ROS
C#【必备技能篇】ConfigurationManager 类的使用(文件App.config的使用)
It cold knowledge (updating ing~)
猜谜语啦(4)
STM32 lights up the 1.8-inch screen under Arduino IDE
Typescript hands-on tutorial, easy to understand
Halcon blob analysis (ball.hdev)
Yolov4 target detection backbone
Redis implements a high-performance full-text search engine -- redisearch
随机推荐
Programming implementation of ROS learning 2 publisher node
U8g2 drawing
[daily training -- Tencent selected 50] 557 Reverse word III in string
Guess riddles (142)
Speech recognition learning summary
Typical low code apaas manufacturer cases
猜谜语啦(11)
Array, date, string object method
Program error record 1: ValueError: invalid literal for int() with base 10: '2.3'
Arduino+a4988 control stepper motor
TypeScript手把手教程,简单易懂
My university
How apaas is applied in different organizational structures
猜谜语啦(3)
Tips 1: Web video playback code
kubeadm系列-00-overview
js异步错误处理
The first week of summer vacation
12、动态链接库,dll
猜谜语啦(6)