Feature Engineering
2022-07-05 08:50:00 【Python code doctor】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing; SimpleImputer is its replacement
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
    """Dictionary feature extraction.
    :return: None
    """
    # Instantiate DictVectorizer
    dict1 = DictVectorizer()
    # Call fit_transform on a list of dicts
    data = dict1.fit_transform([
        {'city': 'Beijing', 'temperature': 100},
        {'city': 'Shanghai', 'temperature': 60},
        {'city': 'Shenzhen', 'temperature': 30}])
    print(data)  # sparse matrix (scipy, built on numpy): saves memory and is easy to read and process
    print(dict1.get_feature_names())
    print(data.toarray())  # dense 2-D numpy array
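# A minimal sketch, not from the original post: DictVectorizer accepts sparse=False,
# which skips the scipy sparse matrix and returns a dense numpy array directly.
def dictvec_dense():
    """Same extraction as dictvec(), but with dense output (assumed variant)."""
    dict2 = DictVectorizer(sparse=False)
    data = dict2.fit_transform([
        {'city': 'Beijing', 'temperature': 100},
        {'city': 'Shanghai', 'temperature': 60}])
    print(dict2.get_feature_names())
    print(data)  # already a 2-D numpy array, no toarray() needed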
def countvec():
    """Text feature extraction with word counts.
    :return: None
    """
    # Instantiate CountVectorizer
    cv = CountVectorizer()
    # Call fit_transform on the input text
    data = cv.fit_transform(["Life is short, I use python", "Life is long, no need for python"])
    # Print the results
    print(cv.get_feature_names())
    print(data)  # sparse matrix (scipy, built on numpy): saves memory and is easy to read and process
    print(data.toarray())  # dense 2-D numpy array
    return None
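# A hedged sketch, not in the original post: CountVectorizer's default token_pattern
# (r"(?u)\b\w\w+\b") drops single-character tokens such as "I" or "a". Overriding
# token_pattern keeps them, which can matter for very short texts.
def countvec_single_chars():
    """CountVectorizer variant that keeps one-character tokens (assumed example)."""
    cv2 = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    data = cv2.fit_transform(["Life is short, I use python"])
    print(cv2.get_feature_names())
    print(data.toarray())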
def cutword():
    """Word segmentation with jieba (the original demo sentences were Chinese;
    they appear here in translation).
    :return: three space-joined strings
    """
    con1 = jieba.cut("Today is cruel, tomorrow will be crueler, and the day after tomorrow will be beautiful, but most people will die tomorrow night, so don't give up on today.")
    con2 = jieba.cut("The light we see coming from distant galaxies was emitted millions of years ago, so when we look at the universe, we are looking at its past.")
    con3 = jieba.cut("If you only know something in one way, you don't really understand it. The secret of understanding the true meaning of things lies in how we relate them to what we already know.")
    # Convert the generators to lists
    content1 = list(con1)
    content2 = list(con2)
    content3 = list(con3)
    # Join each token list back into a space-separated string
    c1 = ' '.join(content1)
    c2 = ' '.join(content2)
    c3 = ' '.join(content3)
    return c1, c2, c3
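# A small alternative sketch (assumption, not from the post): jieba also provides
# lcut(), which returns a list directly, so the cut / list / join steps above can
# be collapsed into a single line per sentence.
def cutword_short(text):
    """Segment one string and join the tokens with spaces (assumed helper)."""
    return ' '.join(jieba.lcut(text))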
def hanzivec():
    """Chinese text feature extraction (word counts on segmented text).
    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate CountVectorizer
    cv = CountVectorizer()
    # Call fit_transform on the segmented text
    data = cv.fit_transform([c1, c2, c3])
    # Print the results
    print(cv.get_feature_names())
    print(data)  # sparse matrix (scipy, built on numpy): saves memory and is easy to read and process
    print(data.toarray())  # dense 2-D numpy array
    return None
def tfidfvec():
    """Chinese text feature extraction with TF-IDF weighting.
    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate TfidfVectorizer
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(data)
    print(tf.get_feature_names())
    print(data.toarray())
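# A hedged sketch, not in the original post: with scikit-learn's defaults
# (smooth_idf=True, norm='l2'), each weight is tf * idf with
# idf = ln((1 + n_docs) / (1 + df)) + 1, and every row is then L2-normalized.
# The fitted idf_ attribute exposes the learned IDF weight per term.
def tfidf_idf():
    """Inspect the IDF weights on a small assumed toy corpus."""
    corpus = ["life is short i use python", "life is long no need for python"]
    tf2 = TfidfVectorizer()
    tf2.fit(corpus)
    print(dict(zip(tf2.get_feature_names(), tf2.idf_)))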
def mm():
    """Normalization (min-max scaling).
    :return: None
    """
    mm = MinMaxScaler(feature_range=(3, 5))
    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print(data)
    return None
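# A minimal check, not from the original post, of MinMaxScaler's column-wise formula:
# x_scaled = (x - x_min) / (x_max - x_min) * (upper - lower) + lower
def mm_check():
    """Recompute the first column of the mm() example by hand (assumed check)."""
    x = np.array([90., 60., 75.])  # first column of the toy data in mm()
    print((x - x.min()) / (x.max() - x.min()) * (5 - 3) + 3)  # matches column 0 printed by mm()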
def stand():
    """Standardization (zero mean, unit variance).
    :return: None
    """
    std = StandardScaler()
    data = std.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
    print(data)
    return None
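# A minimal check, not from the original post, of StandardScaler's formula:
# z = (x - mean) / std, where std is the population standard deviation (ddof=0).
def stand_check():
    """Recompute the first column of the stand() example by hand (assumed check)."""
    x = np.array([1., 2., 4.])  # first column of the toy data in stand()
    print((x - x.mean()) / x.std())  # matches column 0 printed by stand()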
def im():
    """Missing value imputation.
    :return: None
    """
    # Replace NaN entries with the column mean
    # (the original used Imputer(missing_values='NaN', strategy='mean', axis=0), which newer scikit-learn no longer ships)
    im = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = im.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print(data)
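# A hedged variant, not in the original post: SimpleImputer also supports
# strategy='median' and strategy='most_frequent', which are less sensitive to
# outliers than the column mean.
def im_median():
    """Impute missing values with the column median (assumed variant)."""
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    print(imp.fit_transform([[1, 2], [np.nan, 3], [7, 6]]))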
def var():
    """Feature selection: remove low-variance features.
    :return: None
    """
    var = VarianceThreshold(threshold=0.1)
    data = var.fit_transform([[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]])
    print(data)
    return None
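# A minimal check, not from the original post: printing the per-column variances
# shows that only the constant last column [3, 3, 3] falls at or below
# threshold=0.1, so it is the only feature VarianceThreshold removes here.
def var_check():
    """Compute the column variances of the var() example (assumed check)."""
    X = np.array([[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]])
    print(np.var(X, axis=0))  # features with variance <= 0.1 are dropped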
def pca():
    """Principal component analysis for dimensionality reduction.
    :return: None
    """
    # Worth considering once the feature count reaches the hundreds: the data is
    # compressed, its content changes, and the number of features shrinks.
    # n_components accepts:
    # 1. an integer: the number of components to keep
    # 2. a float in (0, 1): the fraction of variance to retain, typically 90%-95%
    pca = PCA(n_components=0.95)
    data = pca.fit_transform([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
    print(data)
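# A hedged sketch, not in the original post: explained_variance_ratio_ reports the
# fraction of total variance carried by each retained component, which makes the
# n_components=0.95 setting above concrete.
def pca_variance():
    """Fit PCA on the same toy data and print the variance ratios (assumed check)."""
    p = PCA(n_components=0.95)
    p.fit([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
    print(p.explained_variance_ratio_)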
pca()