2022-07-05 08:42:00 【python-码博士】
import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
def dictvec():
    """Extract features from dictionary records with DictVectorizer.

    Fits a ``DictVectorizer`` on three city/temperature records, then prints
    the transformed data followed by its ``toarray()`` form.

    :return: None
    """
    # Instantiate the vectorizer.
    dict1 = DictVectorizer()
    # fit_transform takes a list of dicts, one dict per sample.
    data = dict1.fit_transform([
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ])
    print(data)  # sparse matrix (scipy, built on numpy): saves memory
    print(data.toarray())  # 2-D numpy array
def countvec():
    """Turn two short text samples into token-count features.

    Prints the transformed data, then its ``toarray()`` form.

    :return: None
    """
    samples = ["人生 苦短,我 用 python", "人生 漫长,不用 python"]
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(samples)
    # Show the result as returned first, then as a 2-D numpy array.
    print(counts)
    print(counts.toarray())
    return None
def cutword():
    """Segment three fixed Chinese sentences into words with jieba.

    :return: tuple of three strings, each holding one sentence's tokens
        joined by single spaces
    """
    sentences = (
        "今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。",
    )
    # Join the tokens produced by jieba.cut into one space-separated string
    # per sentence.
    first, second, third = (' '.join(jieba.cut(text)) for text in sentences)
    return first, second, third
def hanzivec():
    """Vectorize the segmented Chinese sentences by token count.

    Prints the segmented sentences, the transformed data, and its
    ``toarray()`` form.

    :return: None
    """
    first, second, third = cutword()
    corpus = [first, second, third]
    print(*corpus)
    # Fit and transform in one step on the space-separated sentences.
    matrix = CountVectorizer().fit_transform(corpus)
    print(matrix)
    print(matrix.toarray())
    return None
def tfidfvec():
    """Vectorize the segmented Chinese sentences with TfidfVectorizer.

    Prints the segmented sentences, the transformed data, and its
    ``toarray()`` form.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    # Instantiate the vectorizer and transform the three sentences.
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    # Previously the result was computed and silently discarded; show it the
    # same way the count-based demos do.
    print(data)
    print(data.toarray())
def mm():
    """Rescale a small sample matrix with MinMaxScaler.

    Uses ``feature_range=(3, 5)`` and prints the scaled data.

    :return: None
    """
    scaler = MinMaxScaler(feature_range=(3, 5))
    data = scaler.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    # Previously the result was computed and silently discarded.
    print(data)
    return None
def stand():
    """Standardize a small sample matrix with StandardScaler.

    Prints the transformed data.

    :return: None
    """
    scaler = StandardScaler()
    data = scaler.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
    # Previously the result was computed and silently discarded.
    print(data)
    return None
def im():
    """Fill missing values in a small sample matrix with the column mean.

    Prints the imputed data.

    :return: None
    """
    # ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22;
    # ``sklearn.impute.SimpleImputer`` is its replacement. It always imputes
    # per column (the old ``axis=0``) and expects ``np.nan`` rather than the
    # string 'NaN' as the missing-value marker.
    # NOTE(review): the module-level ``Imputer`` import is still required for
    # the legacy fallback below but fails on scikit-learn >= 0.22.
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:
        # Legacy scikit-learn (< 0.20): keep the original call.
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = imputer.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    # Previously the result was computed and silently discarded.
    print(data)
def var():
    """Feature selection: drop low-variance features.

    Applies ``VarianceThreshold(threshold=0.1)`` to a small sample matrix and
    prints the remaining columns.

    :return: None
    """
    selector = VarianceThreshold(threshold=0.1)
    data = selector.fit_transform([[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]])
    # Previously the result was computed and silently discarded.
    print(data)
    return None
def pca():
    """Reduce the dimensionality of a small sample matrix with PCA.

    Prints the transformed data.

    :return: None
    """
    # Consider simplifying the data once the feature count reaches the
    # hundreds; the values change and the number of features shrinks.
    # n_components can be:
    #   1. an integer - the number of features to reduce to
    #   2. a float in 0-1 - a proportion to keep, typically 90%-95%
    reducer = PCA(n_components=0.95)
    data = reducer.fit_transform([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
    # Previously the result was computed and silently discarded.
    print(data)
- How apaas is applied in different organizational structures
- [matlab] matlab reads and writes Excel
- Typical low code apaas manufacturer cases
- Sword finger offer 05 Replace spaces
- My university
- Matlab tips (28) fuzzy comprehensive evaluation
- Basic number theory - fast power
- Arduino burning program and Arduino burning bootloader
- Example 007: copy data from one list to another list.
- STM32 --- serial port communication
Bluebridge cup internet of things basic graphic tutorial - GPIO output control LD5 on and off
Reasons for the insecurity of C language standard function scanf
Bluebridge cup internet of things basic graphic tutorial - GPIO input key control LD5 on and off
Numpy 小坑:维度 (n, 1) 和 维度 (n, ) 数组相加运算后维度变为 (n, n)
287. Looking for repeats - fast and slow pointer
How apaas is applied in different organizational structures
【日常训练--腾讯精选50】557. 反转字符串中的单词 III
C# LINQ源码分析之Count
Run menu analysis
[牛客网刷题 Day4] JZ35 复杂链表的复制
Task failed task_ 1641530057069_ 0002_ m_ 000000
Guess riddles (10)
Business modeling of software model | stakeholders
Business modeling of software model | object modeling
Lori remote control LEGO motor