当前位置:网站首页>特征工程
特征工程
2022-07-05 08:42:00 【python-码博士】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
""" 字典数据抽取 :return: """
# 实例化
dict1 = DictVectorizer()
# 调用fit_transform
data = dict1.fit_transform(
[{
'city': '北京', 'temperature': 100}, {
'city': '上海', 'temperature': 60}, {
'city': '深圳', 'temperature': 30}])
print(data) # sparse 矩阵 scipy 基于numpy 节约内存 方便读取处理
print(dict1.get_feature_names())
print(data.toarray()) # 二维数组 numpy array数组
def countvec():
""" 对于文本进行特征值化 :return:None """
# 实例化CountVectorizer
cv = CountVectorizer()
# 调用fit_transform输入转换数据
data = cv.fit_transform(["人生 苦短,我 用 python", "人生 漫长,不用 python"])
# 打印
print(cv.get_feature_names())
print(data) # sparse矩阵 scipy 基于numpy 节约内存,方便读取处理
print(data.toarray()) # 二维数组 numpy
return None
def cutword():
""" 分词 :return: """
con1 = jieba.cut("今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。")
con2 = jieba.cut("我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。")
con3 = jieba.cut("如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。")
# 转换为列表
content1 = list(con1)
content2 = list(con2)
content3 = list(con3)
# 把列表转换为字符串
c1 = ' '.join(content1)
c2 = ' '.join(content2)
c3 = ' '.join(content3)
return c1, c2, c3
def hanzivec():
""" 中文特征值化 :return:None """
c1, c2, c3 = cutword()
print(c1, c2, c3)
# 实例化CountVectorizer
cv = CountVectorizer()
# 调用fit_transform输入转换数据
data = cv.fit_transform([c1, c2, c3])
# 打印
print(cv.get_feature_names())
print(data) # sparse矩阵 scipy 基于numpy 节约内存,方便读取处理
print(data.toarray()) # 二维数组 numpy
return None
def tfidfvec():
""" 中文特征值化 :return: """
c1, c2, c3 = cutword()
print(c1, c2, c3)
# 实例化
tf = TfidfVectorizer()
data = tf.fit_transform([c1, c2, c3])
print(data)
print(tf.get_feature_names())
print(data.toarray())
def mm():
""" 归一化处理 :return: """
mm = MinMaxScaler(feature_range=(3, 5))
data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
print(data)
return None
def stand():
""" 标准化处理 :return: """
std = StandardScaler()
data = std.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
print(data)
return None
def im():
""" 缺失值处理 :return:None """
# NaN nan
im = Imputer(missing_values='NaN', strategy='mean', axis=0)
data = im.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
print(data)
def var():
""" 特征选择-删除低方差的特征 :return: None """
var = VarianceThreshold(threshold=0.1)
data = var.fit_transform([[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]])
print(data)
return None
def pca():
""" 主成分分析进行特征选择 :return: """
# 特征数量达到上百的时候 考虑数据简化 数据内容也会变 特征数量减少
# 1.整数 减少到特征数量
# 2.小数 0-1 90% 90%-95%
pca = PCA(n_components=0.95)
data = pca.fit_transform([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
print(data)
pca()
边栏推荐
- Example 005: three numbers sorting input three integers x, y, Z, please output these three numbers from small to large.
- How can fresh students write resumes to attract HR and interviewers
- Program error record 1:valueerror: invalid literal for int() with base 10: '2.3‘
- STM32 --- serial port communication
- Guess riddles (2)
- Meizu Bluetooth remote control temperature and humidity access homeassistant
- [matlab] matlab reads and writes Excel
- STM32 lights up the 1.8-inch screen under Arduino IDE
- 实例003:完全平方数 一个整数,它加上100后是一个完全平方数,再加上168又是一个完全平方数,请问该数是多少?
- 關於線性穩壓器的五個設計細節
猜你喜欢

An enterprise information integration system

Illustration of eight classic pointer written test questions

UE pixel stream, come to a "diet pill"!

Lori remote control LEGO motor

MATLAB skills (28) Fuzzy Comprehensive Evaluation

Meizu Bluetooth remote control temperature and humidity access homeassistant

猜谜语啦(8)

Redis实现高性能的全文搜索引擎---RediSearch

如何写Cover Letter?

剑指 Offer 06. 从尾到头打印链表
随机推荐
Classification of plastic surgery: short in long long long
Business modeling of software model | overview
Arduino+a4988 control stepper motor
Arrangement of some library files
实例008:九九乘法表
Some pitfalls of win10 network sharing
Esphone Feixun DC1 soft change access homeassstant
猜谜语啦(6)
【日常訓練--騰訊精選50】557. 反轉字符串中的單詞 III
Example 007: copy data from one list to another list.
MATLAB skills (28) Fuzzy Comprehensive Evaluation
Basic number theory - fast power
ABC#237 C
Guess riddles (9)
Typescript hands-on tutorial, easy to understand
Warning: retrying occurs during PIP installation
猜谜语啦(9)
每日一题——输入一个日期,输出它是该年的第几天
[daily training -- Tencent selected 50] 557 Reverse word III in string
实例001:数字组合 有四个数字:1、2、3、4,能组成多少个互不相同且无重复数字的三位数?各是多少?