当前位置:网站首页>Feature Engineering
Feature Engineering
2022-07-05 08:50:00 【Python code doctor】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
    """One-hot encode a list of feature dicts with DictVectorizer.

    Categorical values (city) become indicator columns; numeric values
    (temperature) pass through unchanged. Prints the sparse matrix, the
    generated feature names, and the dense 2-D array.
    """
    vectorizer = DictVectorizer()
    samples = [
        {'city': ' Beijing ', 'temperature': 100},
        {'city': ' Shanghai ', 'temperature': 60},
        {'city': ' Shenzhen ', 'temperature': 30},
    ]
    encoded = vectorizer.fit_transform(samples)
    # fit_transform yields a scipy sparse matrix — memory-efficient storage
    print(encoded)
    print(vectorizer.get_feature_names())
    # toarray() densifies the sparse result into a numpy 2-D array
    print(encoded.toarray())
def countvec():
    """Turn two short documents into token-count feature vectors.

    Uses CountVectorizer's default tokenizer (whitespace/punctuation
    split, single-character tokens dropped) and prints the vocabulary,
    the sparse count matrix, and its dense form.

    :return: None
    """
    vectorizer = CountVectorizer()
    docs = [" life Bitterness is short , I use python", " life Very long , no need python"]
    counts = vectorizer.fit_transform(docs)
    print(vectorizer.get_feature_names())
    # sparse scipy matrix built on numpy — saves memory, easy to process
    print(counts)
    # dense numpy 2-D array view of the same counts
    print(counts.toarray())
    return None
def cutword():
    """Segment three Chinese sentences with jieba.

    CountVectorizer/TfidfVectorizer expect space-separated tokens, so each
    sentence is returned as a single string of tokens joined by spaces.

    :return: tuple of three space-joined token strings (c1, c2, c3)
    """
    # jieba.cut returns a lazy generator of tokens; ' '.join consumes it
    # directly, so the intermediate list() materialization is unnecessary.
    c1 = ' '.join(jieba.cut(" It's been a brutal day. , Tomorrow will be more cruel. , The day after tomorrow is beautiful , But most of them will die tomorrow night , So everyone don't give up today ."))
    c2 = ' '.join(jieba.cut(" The light we see coming from distant galaxies was emitted millions of years ago , So when we see the universe , We are looking at its past ."))
    c3 = ' '.join(jieba.cut(" If you only know something in one way , You don't really understand it . The secret of understanding the true meaning of things depends on how we relate them to what we know ."))
    return c1, c2, c3
def hanzivec():
    """Extract count features from pre-segmented Chinese text.

    Relies on cutword() to produce space-joined token strings, since
    CountVectorizer cannot segment Chinese by itself.

    :return: None
    """
    seg1, seg2, seg3 = cutword()
    print(seg1, seg2, seg3)
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform([seg1, seg2, seg3])
    print(vectorizer.get_feature_names())
    # sparse scipy matrix — memory-efficient representation of the counts
    print(counts)
    # dense numpy 2-D array form
    print(counts.toarray())
    return None
def tfidfvec():
    """Extract TF-IDF features from pre-segmented Chinese text.

    TF-IDF weights down tokens that appear in many documents, so the
    printed matrix highlights terms distinctive to each sentence.
    """
    seg1, seg2, seg3 = cutword()
    print(seg1, seg2, seg3)
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform([seg1, seg2, seg3])
    print(weights)
    print(vectorizer.get_feature_names())
    print(weights.toarray())
def mm():
    """Min-max scale each feature column into the range [3, 5].

    Default MinMaxScaler maps to [0, 1]; feature_range shifts that to
    the requested interval.

    :return: None
    """
    scaler = MinMaxScaler(feature_range=(3, 5))
    samples = [
        [90, 2, 10, 40],
        [60, 4, 15, 45],
        [75, 3, 13, 46],
    ]
    print(scaler.fit_transform(samples))
    return None
def stand():
    """Standardize features to zero mean and unit variance per column.

    :return: None
    """
    scaler = StandardScaler()
    samples = [
        [1., -1., 3.],
        [2., 4., 2.],
        [4., 6., -1.],
    ]
    print(scaler.fit_transform(samples))
    return None
def im():
    """Fill missing values (NaN) with the column mean.

    NOTE(review): sklearn.preprocessing.Imputer was deprecated in
    scikit-learn 0.20 and removed in 0.22; its replacement is
    sklearn.impute.SimpleImputer, which takes missing_values=np.nan
    (not the string 'NaN') and always works column-wise (no axis=).
    The top-of-file `Imputer` import should be updated accordingly.

    :return: None
    """
    # Local import so this function works on modern scikit-learn even
    # while the legacy top-level import remains untouched.
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    # The np.nan in row 2, column 1 is replaced by mean(1, 7) = 4.
    data = imputer.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print(data)
def var():
    """Feature selection: drop columns whose variance is below 0.1.

    Constant (or near-constant) columns carry no information, so
    VarianceThreshold removes them — here the all-3 last column.

    :return: None
    """
    selector = VarianceThreshold(threshold=0.1)
    matrix = [
        [0, 2, 0, 3],
        [3, 1, 4, 3],
        [4, 1, 1, 3],
    ]
    print(selector.fit_transform(matrix))
    return None
def pca():
    """Reduce dimensionality with PCA, retaining 95% of the variance.

    n_components semantics:
      * int   -> keep exactly that many components;
      * float in (0, 1) -> keep as many components as needed to explain
        that fraction of the total variance (0.90-0.95 is typical).
    Useful once feature counts reach the hundreds.
    """
    reducer = PCA(n_components=0.95)
    samples = [
        [2, 8, 4, 5],
        [6, 3, 0, 8],
        [5, 4, 9, 1],
    ]
    print(reducer.fit_transform(samples))
# Demo entry point: runs the PCA example whenever this script is executed
# (or imported — NOTE(review): consider an `if __name__ == "__main__":` guard).
pca()
边栏推荐
- Typical low code apaas manufacturer cases
- 特征工程
- Classification of plastic surgery: short in long long long
- 容易混淆的基本概念 成员变量 局部变量 全局变量
- Arduino+a4988 control stepper motor
- Array,Date,String 对象方法
- Use arm neon operation to improve memory copy speed
- [daily training] 1200 Minimum absolute difference
- Programming implementation of ROS learning 5-client node
- [daiy4] copy of JZ35 complex linked list
猜你喜欢
随机推荐
287. 寻找重复数-快慢指针
Halcon color recognition_ fuses. hdev:classify fuses by color
Hello everyone, welcome to my CSDN blog!
696. 计数二进制子串
RT-Thread内核快速入门,内核实现与应用开发学习随笔记
Affected tree (tree DP)
Redis实现高性能的全文搜索引擎---RediSearch
Typescript hands-on tutorial, easy to understand
asp.net(c#)的货币格式化
Business modeling of software model | overview
319. 灯泡开关
Halcon Chinese character recognition
Classification of plastic surgery: short in long long long
Halcon snap, get the area and position of coins
Arduino+a4988 control stepper motor
暑假第一周
How apaas is applied in different organizational structures
Lori remote control LEGO motor
js异步错误处理
资源变现小程序添加折扣充值和折扣影票插件





![[牛客网刷题 Day4] JZ55 二叉树的深度](/img/f7/ca8ad43b8d9bf13df949b2f00f6d6c.png)


