
Feature Engineering

2022-07-05 08:50:00 Python code doctor

import numpy as np
import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer  # Imputer was removed in scikit-learn 0.22; SimpleImputer replaces it
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA


def dictvec():
    """  Dictionary data extraction  :return: """
    #  Instantiation 
    dict1 = DictVectorizer()
    #  call fit_transform
    data = dict1.fit_transform(
        [{
    'city': ' Beijing ', 'temperature': 100}, {
    'city': ' Shanghai ', 'temperature': 60}, {
    'city': ' Shenzhen ', 'temperature': 30}])
    print(data)  # sparse  matrix  scipy  be based on numpy  To save memory   Easy to read and process 
    print(dict1.get_feature_names())
    print(data.toarray())  #  Two dimensional array  numpy array Array 
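
# A minimal sketch (dictvec_dense is an illustrative name; sparse=False is a
# documented DictVectorizer option): returning a dense array directly makes
# the toarray() step above unnecessary.
def dictvec_dense():
    dv = DictVectorizer(sparse=False)
    data = dv.fit_transform([{'city': 'Beijing', 'temperature': 100},
                             {'city': 'Shanghai', 'temperature': 60}])
    print(dv.get_feature_names_out())  # ['city=Beijing' 'city=Shanghai' 'temperature']
    print(data)  # already a dense numpy array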


def countvec():
    """  Text is characterized  :return:None """
    #  Instantiation CountVectorizer
    cv = CountVectorizer()
    #  call fit_transform Input conversion data 
    data = cv.fit_transform([" life   Bitterness is short , I   use  python", " life   Very long , no need  python"])

    #  Print 
    print(cv.get_feature_names())
    print(data)  # sparse matrix  scipy  be based on numpy  To save memory , Easy to read and process 
    print(data.toarray())  #  Two dimensional array  numpy

    return None
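
# A minimal sketch (countvec_vocab is an illustrative name): the fitted
# vocabulary_ attribute maps each token to its column index, which explains
# the column order seen in data.toarray() above.
def countvec_vocab():
    cv = CountVectorizer()
    X = cv.fit_transform(["life is short, I use python",
                          "life is too long, no need for python"])
    print(cv.vocabulary_)  # token -> column index ("I" is dropped by the default single-character filter)
    print(X.toarray())     # per-document token counts, columns in vocabulary order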


def cutword():
    """  participle  :return: """
    con1 = jieba.cut(" It's been a brutal day. , Tomorrow will be more cruel. , The day after tomorrow is beautiful , But most of them will die tomorrow night , So everyone don't give up today .")
    con2 = jieba.cut(" The light we see coming from distant galaxies was emitted millions of years ago , So when we see the universe , We are looking at its past .")
    con3 = jieba.cut(" If you only know something in one way , You don't really understand it . The secret of understanding the true meaning of things depends on how we relate them to what we know .")

    #  Convert to list 
    content1 = list(con1)
    content2 = list(con2)
    content3 = list(con3)

    #  Convert list to string 
    c1 = ' '.join(content1)
    c2 = ' '.join(content2)
    c3 = ' '.join(content3)

    return c1, c2, c3
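
# A minimal sketch (cutword_short is an illustrative name; jieba.lcut is part
# of jieba's documented API): lcut returns the token list directly, collapsing
# the list()/join steps above into one call. jieba is built for Chinese text,
# so an illustrative Chinese sentence is used here.
def cutword_short():
    return ' '.join(jieba.lcut("人生苦短, 我用python"))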


def hanzivec():
    """  Chinese eigenvalue  :return:None """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    #  Instantiation CountVectorizer
    cv = CountVectorizer()
    #  call fit_transform Input conversion data 
    data = cv.fit_transform([c1, c2, c3])

    #  Print 
    print(cv.get_feature_names())
    print(data)  # sparse matrix  scipy  be based on numpy  To save memory , Easy to read and process 
    print(data.toarray())  #  Two dimensional array  numpy

    return None


def tfidfvec():
    """  Chinese eigenvalue  :return: """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    #  Instantiation 
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])
    print(data)
    print(tf.get_feature_names())
    print(data.toarray())
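
# Why TF-IDF: a term's weight grows with its count in a document (tf) and
# shrinks the more documents contain it (idf). A minimal sketch of sklearn's
# default smoothed idf (smooth_idf=True); the full transform additionally
# applies L2 normalization to each document vector (idf_sketch is an
# illustrative name).
def idf_sketch():
    n_docs, df_t = 3, 1  # suppose a term appears in 1 of 3 documents
    idf = np.log((1 + n_docs) / (1 + df_t)) + 1
    print(idf)  # rarer terms get a larger idf, hence a larger weight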


def mm():
    """  normalization  :return: """
    mm = MinMaxScaler(feature_range=(3, 5))
    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print(data)

    return None
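
# The formula behind MinMaxScaler, checked by hand on the first column above
# (mm_by_hand is an illustrative name): X' = (X - X.min) / (X.max - X.min),
# then mapped into feature_range (mi, mx) as X'' = X' * (mx - mi) + mi.
def mm_by_hand():
    col = np.array([90., 60., 75.])
    mi, mx = 3, 5
    print((col - col.min()) / (col.max() - col.min()) * (mx - mi) + mi)  # [5. 3. 4.]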


def stand():
    """  Standardized treatment  :return: """
    std = StandardScaler()

    data = std.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])

    print(data)

    return None
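
# StandardScaler computes z = (x - mean) / std per column, using the population
# standard deviation (ddof=0). A minimal sketch reproducing the output of
# stand() with plain numpy (stand_by_hand is an illustrative name):
def stand_by_hand():
    X = np.array([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
    print((X - X.mean(axis=0)) / X.std(axis=0))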


def im():
    """  Missing value processing  :return:None """
    # NaN nan
    im = Imputer(missing_values='NaN', strategy='mean', axis=0)
    data = im.fit_transform([[1, 2], [np.nan, 3], [7, 6]])

    print(data)
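
# SimpleImputer also supports 'median', 'most_frequent', and 'constant'
# strategies (all documented in sklearn.impute). A minimal sketch
# (im_median is an illustrative name):
def im_median():
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    print(imp.fit_transform([[1, 2], [np.nan, 3], [7, 6]]))  # NaN -> 4.0, the median of 1 and 7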


def var():
    """  feature selection - Remove low variance features  :return: None """
    var = VarianceThreshold(threshold=0.1)

    data = var.fit_transform([[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]])

    print(data)
    return None
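
# Which columns survive is decided by their variance: features with variance
# not above the threshold are removed. A minimal sketch with plain numpy
# (var_by_hand is an illustrative name):
def var_by_hand():
    X = np.array([[0, 2, 0, 3], [3, 1, 4, 3], [4, 1, 1, 3]])
    print(X.var(axis=0))  # the constant last column has variance 0 and is dropped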


def pca():
    """  Principal component analysis for feature selection  :return: """
    #  When the number of features reaches hundreds   Consider data simplification   The data content will also change   The number of features decreases 
    # 1. Integers   Reduce to the number of features 
    # 2. decimal  0-1 90% 90%-95%
    pca = PCA(n_components=0.95)
    data = pca.fit_transform([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])

    print(data)
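
# With a float n_components, PCA keeps the smallest number of components whose
# cumulative explained variance reaches that fraction. A minimal sketch
# inspecting the documented explained_variance_ratio_ attribute (pca_ratio
# is an illustrative name):
def pca_ratio():
    p = PCA(n_components=0.95)
    p.fit([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
    print(p.explained_variance_ratio_)  # variance fraction per retained component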


pca()

Copyright notice
This article was created by [Python code doctor]. When reposting, please include a link to the original. Thank you.
https://yzsam.com/2022/186/202207050841505868.html