当前位置:网站首页>特征工程
特征工程
2022-07-05 08:42:00 【python-码博士】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
    """Extract features from a list of dicts with ``DictVectorizer``.

    Prints the transformed sparse matrix, the generated feature names and
    the dense array form of the same data.

    :return: None
    """
    # Instantiate the vectorizer.
    dict1 = DictVectorizer()
    # Fit on the sample records and transform them in a single call.
    data = dict1.fit_transform(
        [
            {'city': '北京', 'temperature': 100},
            {'city': '上海', 'temperature': 60},
            {'city': '深圳', 'temperature': 30},
        ]
    )
    print(data)  # sparse matrix (scipy, built on numpy): saves memory, easy to read/process
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older releases.
    if hasattr(dict1, 'get_feature_names_out'):
        feature_names = dict1.get_feature_names_out().tolist()
    else:
        feature_names = dict1.get_feature_names()
    print(feature_names)
    print(data.toarray())  # two-dimensional numpy array
def countvec():
    """Turn two short text samples into token-count features.

    Prints the learned vocabulary, the sparse count matrix and its dense
    array form.

    :return: None
    """
    # Instantiate the CountVectorizer.
    cv = CountVectorizer()
    # Fit on the input texts and transform them in a single call.
    data = cv.fit_transform(["人生 苦短,我 用 python", "人生 漫长,不用 python"])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    print(feature_names)
    print(data)  # sparse matrix (scipy, built on numpy): saves memory, easy to read/process
    print(data.toarray())  # two-dimensional numpy array
    return None
def cutword():
    """Segment three fixed Chinese sentences into words with jieba.

    :return: tuple ``(c1, c2, c3)`` of strings, each holding the tokens of
        one sentence joined by single spaces
    """
    sentences = (
        "今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。",
    )
    # Tokenize each sentence and glue its tokens together with spaces so the
    # result can be fed to a whitespace-aware vectorizer.
    c1, c2, c3 = (' '.join(jieba.cut(sentence)) for sentence in sentences)
    return c1, c2, c3
def hanzivec():
    """Turn the segmented Chinese sentences into token-count features.

    Uses ``cutword()`` to obtain three space-separated sentences, prints
    them, then prints the learned vocabulary, the sparse count matrix and
    its dense array form.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate the CountVectorizer.
    cv = CountVectorizer()
    # Fit on the segmented sentences and transform them in a single call.
    data = cv.fit_transform([c1, c2, c3])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    print(feature_names)
    print(data)  # sparse matrix (scipy, built on numpy): saves memory, easy to read/process
    print(data.toarray())  # two-dimensional numpy array
    return None
def tfidfvec():
    """Turn the segmented Chinese sentences into TF-IDF features.

    Uses ``cutword()`` to obtain three space-separated sentences, prints
    them, then prints the sparse TF-IDF matrix, the learned vocabulary and
    the dense array form.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate the TfidfVectorizer.
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(data)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older releases.
    if hasattr(tf, 'get_feature_names_out'):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()
    print(feature_names)
    print(data.toarray())
def mm():
    """Min-max scale a small sample matrix into the range (3, 5) and print it.

    :return: None
    """
    samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(3, 5))
    print(scaler.fit_transform(samples))
def stand():
    """Standardize a small sample matrix with ``StandardScaler`` and print it.

    :return: None
    """
    samples = [[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]]
    scaler = StandardScaler()
    print(scaler.fit_transform(samples))
def im():
    """Fill missing values with the per-column mean and print the result.

    :return: None
    """
    samples = [[1, 2], [np.nan, 3], [7, 6]]
    # sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
    # removed in 0.22. Prefer sklearn.impute.SimpleImputer when available and
    # fall back to the legacy class (imported at file level) otherwise.
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    else:
        # SimpleImputer has no `axis` argument (it always works column-wise)
        # and expects the missing-value marker itself rather than the string 'NaN'.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = imputer.fit_transform(samples)
    print(data)
def var():
    """Feature selection: drop low-variance features and print what remains.

    Applies ``VarianceThreshold`` with a threshold of 0.1 to a small sample
    matrix whose last column is constant.

    :return: None
    """
    samples = [[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]]
    selector = VarianceThreshold(threshold=0.1)
    print(selector.fit_transform(samples))
def pca():
    """Reduce a small sample matrix with principal component analysis and print it.

    :return: None
    """
    # Consider simplifying the data once the feature count reaches the
    # hundreds; the data values change and the number of features shrinks.
    # n_components can be given as:
    #   1. an integer - the number of features to reduce to
    #   2. a float between 0 and 1 - the share to keep, usually 90%-95%
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    reducer = PCA(n_components=0.95)
    print(reducer.fit_transform(samples))
# Script entry: run the PCA demo. This call executes whenever the module is
# run or imported.
pca()
边栏推荐
- Example 010: time to show
- Guess riddles (11)
- How can fresh students write resumes to attract HR and interviewers
- Example 009: pause output for one second
- Yolov4 target detection backbone
- C# LINQ源码分析之Count
- Business modeling of software model | object modeling
- Infix expression evaluation
- ESPHome Feixun DC1 soft modification for Home Assistant access
- Low code platform | apaas platform construction analysis
猜你喜欢
实例003:完全平方数 一个整数,它加上100后是一个完全平方数,再加上168又是一个完全平方数,请问该数是多少?
Lori remote control commissioning record
STM32---ADC
Pytorch entry record
Explore the authentication mechanism of StarUML
Sword finger offer 05 Replace spaces
Business modeling of software model | object modeling
剑指 Offer 05. 替换空格
Halcon affine transformations to regions
[matlab] matlab reads and writes Excel
随机推荐
每日一题——输入一个日期,输出它是该年的第几天
猜谜语啦(7)
C language data type replacement
Guess riddles (3)
猜谜语啦(5)
Dynamic dimensions required for input: input, but no shapes were provided. Automatically overriding
[牛客网刷题 Day4] JZ32 从上往下打印二叉树
Halcon snap, get the area and position of coins
2020-05-21
Example 007: copy data from one list to another list.
MATLAB skills (28) Fuzzy Comprehensive Evaluation
Example 008: 99 multiplication table
Bluebridge cup internet of things basic graphic tutorial - GPIO input key control LD5 on and off
【三层架构及JDBC总结】
Program error record 1: ValueError: invalid literal for int() with base 10: '2.3'
Daily question - input a date and output the day of the year
剑指 Offer 05. 替换空格
696. 计数二进制子串
How apaas is applied in different organizational structures
Example 005: three numbers sorting input three integers x, y, Z, please output these three numbers from small to large.