当前位置:网站首页>Feature Engineering
Feature Engineering
2022-07-05 08:50:00 【Python code doctor】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
    """Demonstrate feature extraction from a list of dicts with DictVectorizer.

    Fits a ``DictVectorizer`` on three ``{'city', 'temperature'}`` records and
    prints, in order: the sparse result, the generated feature names, and the
    dense 2-D array form of the result.

    :return: None
    """
    # Instantiate the vectorizer.
    vectorizer = DictVectorizer()
    # Learn the feature mapping and transform the records in one step.
    data = vectorizer.fit_transform(
        [
            {'city': ' Beijing ', 'temperature': 100},
            {'city': ' Shanghai ', 'temperature': 60},
            {'city': ' Shenzhen ', 'temperature': 30},
        ]
    )
    # Sparse (scipy) matrix: only the non-zero entries are stored.
    print(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    # Normalise to a plain list of str so the printed form matches the
    # list the legacy API returned.
    print([str(name) for name in names])
    # Dense two-dimensional numpy array.
    print(data.toarray())
def countvec():
    """Demonstrate bag-of-words text vectorisation with CountVectorizer.

    Fits a ``CountVectorizer`` on two short sample sentences and prints, in
    order: the learned vocabulary (feature names), the sparse count matrix,
    and its dense 2-D array form.

    :return: None
    """
    # Instantiate CountVectorizer.
    cv = CountVectorizer()
    # Learn the vocabulary and transform the documents in one step.
    data = cv.fit_transform([" life Bitterness is short , I use python", " life Very long , no need python"])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        names = cv.get_feature_names_out()
    else:
        names = cv.get_feature_names()
    # Normalise to a plain list of str so the printed form matches the
    # list the legacy API returned.
    print([str(name) for name in names])
    # Sparse (scipy) matrix: only the non-zero entries are stored.
    print(data)
    # Dense two-dimensional numpy array.
    print(data.toarray())
    return None
def cutword():
    """Tokenise three sample sentences with jieba.

    Each sentence is passed through ``jieba.cut`` and its tokens are joined
    back together with a single space between them.

    :return: tuple ``(c1, c2, c3)`` of the three space-joined token strings
    """
    sentences = (
        " It's been a brutal day. , Tomorrow will be more cruel. , The day after tomorrow is beautiful , But most of them will die tomorrow night , So everyone don't give up today .",
        " The light we see coming from distant galaxies was emitted millions of years ago , So when we see the universe , We are looking at its past .",
        " If you only know something in one way , You don't really understand it . The secret of understanding the true meaning of things depends on how we relate them to what we know .",
    )
    # jieba.cut yields tokens lazily; joining consumes each generator in turn.
    joined = [' '.join(jieba.cut(sentence)) for sentence in sentences]
    return tuple(joined)
def hanzivec():
    """Demonstrate CountVectorizer on jieba-segmented text.

    Obtains three pre-segmented, space-joined sentences from ``cutword()``,
    prints them, fits a ``CountVectorizer`` on them and prints, in order: the
    learned feature names, the sparse count matrix, and its dense array form.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate CountVectorizer.
    cv = CountVectorizer()
    # Learn the vocabulary and transform the documents in one step.
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        names = cv.get_feature_names_out()
    else:
        names = cv.get_feature_names()
    # Normalise to a plain list of str so the printed form matches the
    # list the legacy API returned.
    print([str(name) for name in names])
    # Sparse (scipy) matrix: only the non-zero entries are stored.
    print(data)
    # Dense two-dimensional numpy array.
    print(data.toarray())
    return None
def tfidfvec():
    """Demonstrate TF-IDF weighting on jieba-segmented text.

    Obtains three pre-segmented, space-joined sentences from ``cutword()``,
    prints them, fits a ``TfidfVectorizer`` on them and prints, in order: the
    sparse TF-IDF matrix, the learned feature names, and the dense array form.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate the TF-IDF vectorizer.
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    # Sparse (scipy) matrix: only the non-zero entries are stored.
    print(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(tf, 'get_feature_names_out'):
        names = tf.get_feature_names_out()
    else:
        names = tf.get_feature_names()
    # Normalise to a plain list of str so the printed form matches the
    # list the legacy API returned.
    print([str(name) for name in names])
    # Dense two-dimensional numpy array.
    print(data.toarray())
def mm():
    """Demonstrate min-max normalisation.

    Rescales a 3x4 sample matrix with ``MinMaxScaler`` configured for the
    target range ``(3, 5)`` and prints the result.

    :return: None
    """
    samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(3, 5))
    print(scaler.fit_transform(samples))
def stand():
    """Demonstrate standardisation.

    Runs a 3x3 sample matrix through ``StandardScaler`` and prints the
    transformed result.

    :return: None
    """
    samples = [[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]]
    scaler = StandardScaler()
    print(scaler.fit_transform(samples))
def im():
    """Demonstrate missing-value imputation with the column mean.

    Replaces the single ``np.nan`` in a 3x2 sample matrix with the mean of
    its column and prints the completed matrix.

    :return: None
    """
    # sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
    # removed in 0.22; sklearn.impute.SimpleImputer is its replacement.
    # NOTE(review): the file-level ``from sklearn.preprocessing import ...
    # Imputer`` line fails on scikit-learn >= 0.22 and must be dropped
    # separately for this module to import there.
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:
        # Legacy scikit-learn (< 0.20): keep the original call unchanged.
        from sklearn.preprocessing import Imputer
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    else:
        # SimpleImputer always imputes per column (the old axis=0), and it
        # expects the real np.nan sentinel rather than the string 'NaN'.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = imputer.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print(data)
def var():
    """Demonstrate feature selection by removing low-variance features.

    Applies ``VarianceThreshold`` with ``threshold=0.1`` to a 3x4 sample
    matrix and prints the columns that are kept.

    :return: None
    """
    samples = [[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]]
    selector = VarianceThreshold(threshold=0.1)
    print(selector.fit_transform(samples))
def pca():
    """Demonstrate dimensionality reduction with principal component analysis.

    Fits ``PCA`` with ``n_components=0.95`` on a 3x4 sample matrix and prints
    the projected data.

    :return: None
    """
    # Worth considering once the feature count reaches the hundreds: the
    # data values change and the number of features shrinks.
    # n_components may be either
    #   1. an integer - the number of features to reduce to, or
    #   2. a fraction between 0 and 1 - typically 90%-95%.
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    reducer = PCA(n_components=0.95)
    print(reducer.fit_transform(samples))
# Run the PCA demo. There is no ``__main__`` guard, so this call also
# executes whenever the module is imported.
pca()
边栏推荐
- golang 基础 ——map、数组、切片 存放不同类型的数据
- [daily training -- Tencent selected 50] 557 Reverse word III in string
- EA introduction notes
- Business modeling | process of software model
- Programming implementation of ROS learning 2 publisher node
- Halcon clolor_ pieces. Hedv: classifier_ Color recognition
- Classification of plastic surgery: short in long long long
- Guess riddles (2)
- Tips 1: Web video playback code
- Task failed task_ 1641530057069_ 0002_ m_ 000000
猜你喜欢
Guess riddles (7)
Typescript hands-on tutorial, easy to understand
Guess riddles (11)
Halcon blob analysis (ball.hdev)
Business modeling of software model | object modeling
Explore the authentication mechanism of StarUML
Halcon: check of blob analysis_ Blister capsule detection
Guess riddles (2)
Numpy 小坑:维度 (n, 1) 和 维度 (n, ) 数组相加运算后维度变为 (n, n)
Low code platform | apaas platform construction analysis
随机推荐
Shift operation of complement
Wechat H5 official account to get openid climbing account
TF coordinate transformation of common components of ros-9 ROS
Warning: retrying occurs during PIP installation
Programming implementation of ROS learning 2 publisher node
Guess riddles (142)
猜谜语啦(9)
Halcon clolor_ pieces. Hedv: classifier_ Color recognition
暑假第一周
Bit operation related operations
猜谜语啦(7)
[matlab] matlab reads and writes Excel
【日常训练--腾讯精选50】557. 反转字符串中的单词 III
[daiy4] jz32 print binary tree from top to bottom
Several problems to be considered and solved in the design of multi tenant architecture
C language data type replacement
资源变现小程序添加折扣充值和折扣影票插件
Numpy 小坑:维度 (n, 1) 和 维度 (n, ) 数组相加运算后维度变为 (n, n)
Mathematical modeling: factor analysis
LLVM之父Chris Lattner:为什么我们要重建AI基础设施软件