当前位置:网站首页>Feature Engineering
Feature Engineering
2022-07-05 08:50:00 【Python code doctor】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
    """Vectorize a list of feature dicts with DictVectorizer.

    Demonstrates one-hot encoding of the categorical 'city' feature and
    pass-through of the numeric 'temperature' feature.

    Returns:
        None. Prints the sparse matrix, the learned feature names, and
        the dense 2-D numpy representation.
    """
    # Instantiate the vectorizer (sparse output by default).
    dict1 = DictVectorizer()
    # fit_transform learns the feature vocabulary and transforms in one step.
    data = dict1.fit_transform(
        [{
            'city': ' Beijing ', 'temperature': 100}, {
            'city': ' Shanghai ', 'temperature': 60}, {
            'city': ' Shenzhen ', 'temperature': 30}])
    print(data)  # scipy sparse matrix (numpy-based): saves memory, easy to process
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(dict1.get_feature_names_out())
    print(data.toarray())  # dense 2-D numpy array
def countvec():
    """Extract bag-of-words count features from raw text.

    Returns:
        None. Prints the learned vocabulary, the sparse count matrix,
        and its dense 2-D numpy form.
    """
    # Instantiate CountVectorizer (default token pattern splits on whitespace
    # and punctuation, dropping single-character tokens).
    cv = CountVectorizer()
    # Learn the vocabulary and transform the documents in one call.
    data = cv.fit_transform([" life Bitterness is short , I use python", " life Very long , no need python"])
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(cv.get_feature_names_out())
    print(data)  # scipy sparse matrix (numpy-based): saves memory, easy to process
    print(data.toarray())  # dense 2-D numpy array
    return None
def cutword():
    """Segment three Chinese sentences with jieba.

    Returns:
        tuple[str, str, str]: the three sentences with tokens separated
        by single spaces, ready for CountVectorizer/TfidfVectorizer.
    """
    sentences = (
        " It's been a brutal day. , Tomorrow will be more cruel. , The day after tomorrow is beautiful , But most of them will die tomorrow night , So everyone don't give up today .",
        " The light we see coming from distant galaxies was emitted millions of years ago , So when we see the universe , We are looking at its past .",
        " If you only know something in one way , You don't really understand it . The secret of understanding the true meaning of things depends on how we relate them to what we know .",
    )
    # jieba.cut yields tokens lazily; join them straight into one
    # space-separated string per sentence.
    c1, c2, c3 = (' '.join(jieba.cut(text)) for text in sentences)
    return c1, c2, c3
def hanzivec():
    """Extract count features from Chinese text.

    CountVectorizer splits on whitespace, so the text must be segmented
    first (see cutword), which inserts spaces between Chinese tokens.

    Returns:
        None. Prints the segmented text, the vocabulary, the sparse
        count matrix, and its dense 2-D numpy form.
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate CountVectorizer.
    cv = CountVectorizer()
    # Learn the vocabulary and transform the documents in one call.
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(cv.get_feature_names_out())
    print(data)  # scipy sparse matrix (numpy-based): saves memory, easy to process
    print(data.toarray())  # dense 2-D numpy array
    return None
def tfidfvec():
    """Extract TF-IDF features from (pre-segmented) Chinese text.

    Unlike raw counts, TF-IDF down-weights terms that appear in many
    documents, highlighting terms that discriminate between them.

    Returns:
        None. Prints the segmented text, the sparse TF-IDF matrix, the
        vocabulary, and the dense 2-D numpy form.
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate the TF-IDF vectorizer.
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(data)
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(tf.get_feature_names_out())
    print(data.toarray())
def mm():
    """Min-max scale each feature column into the range [3, 5].

    Returns:
        None. Prints the scaled 2-D array.
    """
    samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(3, 5))
    scaled = scaler.fit_transform(samples)
    print(scaled)
    return None
def stand():
    """Standardize each feature column to zero mean and unit variance.

    Returns:
        None. Prints the standardized 2-D array.
    """
    samples = [[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]]
    scaler = StandardScaler()
    standardized = scaler.fit_transform(samples)
    print(standardized)
    return None
def im():
    """Impute missing values (NaN) with the per-column mean.

    sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
    SimpleImputer is its replacement. missing_values takes np.nan (not
    the string 'NaN'), and the old axis=0 (column-wise) behavior is the
    only mode SimpleImputer supports, so no axis argument is needed.

    Returns:
        None. Prints the imputed 2-D array.
    """
    # Local import keeps this fix self-contained; the top-level
    # `Imputer` import is obsolete and should eventually be dropped.
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = imputer.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print(data)
def var():
    """Feature selection: drop features whose variance is below 0.1.

    Near-constant columns carry little information; VarianceThreshold
    removes them before modeling.

    Returns:
        None. Prints the matrix with low-variance columns removed.
    """
    samples = [[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]]
    selector = VarianceThreshold(threshold=0.1)
    selected = selector.fit_transform(samples)
    print(selected)
    return None
def pca():
    """Dimensionality reduction with principal component analysis.

    Useful once the feature count reaches the hundreds: the data content
    changes and the number of features shrinks.

    n_components semantics:
      * integer        -> reduce to exactly that many components
      * float in (0,1) -> keep enough components to retain that fraction
                          of the variance (typically 0.90-0.95)

    Returns:
        None. Prints the projected samples.
    """
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    reducer = PCA(n_components=0.95)  # keep 95% of the variance
    projected = reducer.fit_transform(samples)
    print(projected)
# Guard the demo call so importing this module has no side effects;
# running it as a script still executes the PCA example.
if __name__ == "__main__":
    pca()
边栏推荐
- [牛客网刷题 Day4] JZ55 二叉树的深度
- Program error record 1:valueerror: invalid literal for int() with base 10: '2.3‘
- Count of C # LINQ source code analysis
- 696. 计数二进制子串
- The location search property gets the login user name
- kubeadm系列-01-preflight究竟有多少check
- Old Wang's esp8266 and old Wu's ws2818 light strip
- 资源变现小程序添加折扣充值和折扣影票插件
- Ros-10 roslaunch summary
- 猜谜语啦(11)
猜你喜欢

猜谜语啦(9)

C# LINQ源码分析之Count

Programming implementation of ROS learning 5-client node
![C [essential skills] use of configurationmanager class (use of file app.config)](/img/8b/e56f87c2d0fbbb1251ec01b99204a1.png)
C [essential skills] use of configurationmanager class (use of file app.config)

Agile project management of project management

Business modeling | process of software model

IT冷知识(更新ing~)

C#【必备技能篇】ConfigurationManager 类的使用(文件App.config的使用)

Programming implementation of ROS learning 6 -service node

TypeScript手把手教程,简单易懂
随机推荐
C language data type replacement
Affected tree (tree DP)
Arrangement of some library files
Program error record 1:valueerror: invalid literal for int() with base 10: '2.3‘
[matlab] matlab reads and writes Excel
Hello everyone, welcome to my CSDN blog!
Business modeling of software model | vision
IT冷知识(更新ing~)
How apaas is applied in different organizational structures
Ecmascript6 introduction and environment construction
U8g2 drawing
Guess riddles (142)
Classification of plastic surgery: short in long long long
696. Count binary substring
Halcon shape_ trans
Halcon color recognition_ fuses. hdev:classify fuses by color
Business modeling of software model | stakeholders
Illustration of eight classic pointer written test questions
TypeScript手把手教程,简单易懂
The first week of summer vacation