ICode9

精准搜索请尝试: 精确搜索
首页 > 编程语言> 文章详细

python数据分析之分类模型与回归模型-第七次笔记

2021-07-13 20:31:13  阅读:227  来源: 互联网

标签:数据分析 code python pred 模型 part train import sklearn


python数据分析之分类模型与回归模型-第七次笔记


1.分类模型

– *1.1KNN 算法
– *1.2朴素贝叶斯 算法
– *1.3支持向量机SVM 算法
– *1.4集成方法—随机森林算法
– *1.5集成方法—Adaboost 算法
– *1.6决策树

2.回归模型

– *2.1线性回归
– *2.2岭回归
– *2.3Lasso回归
– *2.4逻辑回归
– *2.5人工神经网络
– *2.6GBDT,回归树和提升树


提取数据

        # Split the data into training, validation and test sets in a 6:2:2 ratio.
        from sklearn.model_selection import train_test_split
        f_names = features.columns.values
        f_v, l_v = features.values, label.values
        # First hold out 20% of all rows as the validation set ...
        X_tt, X_validation, Y_tt, Y_validation = train_test_split(
            f_v, l_v, test_size=0.2
        )
        # ... then 25% of the remaining 80% (20% overall) becomes the test set.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_tt, Y_tt, test_size=0.25
        )
    

1.分类模型

####  1.1KNN 算法

```code

    # Import the nearest-neighbour estimators.
    from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
    # Register a KNN classifier that uses the 3 nearest neighbours.
    knn_clf = KNeighborsClassifier(n_neighbors=3)
    models.append(("KNN", knn_clf))
[/code]

####  1.2朴素贝叶斯 算法

```code
        from sklearn.naive_bayes import GaussianNB, BernoulliNB
        # Naive Bayes: register the Gaussian and the Bernoulli variant.
        models.extend([
            ("GaussianNB", GaussianNB()),
            ("BernoulliNB", BernoulliNB()),
        ])
[/code]

####  1.3支持向量机SVM 算法

```code
        from sklearn.svm import SVC
        # Support vector machine. Author's note: the C parameter controls
        # how precisely the training data is fitted.
        svm_clf = SVC(C=1000)
        models.append(("SVM Classifier", svm_clf))
[/code]

####  1.4集成方法—随机森林算法

```code
        from sklearn.ensemble import RandomForestClassifier
        # Random forest with scikit-learn's default settings.
        models.append(("OriginalRandomForest", RandomForestClassifier()))
        # Random forest with 11 trees and max_features=None.
        models.append(("RandomForest", RandomForestClassifier(n_estimators=11, max_features=None)))
[/code]

####  1.5集成方法—Adaboost 算法

```code
        from sklearn.ensemble import AdaBoostClassifier
        # Boosted ensemble with 100 estimators. Other options noted by the
        # author: base_estimator=SVC(), algorithm="SAMME".
        adaboost_clf = AdaBoostClassifier(n_estimators=100)
        models.append(("Adaboost", adaboost_clf))
[/code]

####  1.6决策树

```code
        from sklearn.tree import DecisionTreeClassifier, export_graphviz
        # Author's note: min_impurity_split=0.1 (a minimum impurity threshold
        # for splitting) can be used as a pruning method.
        # Two trees: one with the default criterion (labelled Gini) and one
        # that splits on entropy.
        for tree_name, tree_kwargs in (
            ("DecisionTreeGini", {}),
            ("DecisionTreeEntropy", {"criterion": "entropy"}),
        ):
            models.append((tree_name, DecisionTreeClassifier(**tree_kwargs)))
[/code]

###  2.回归模型

####  2.1线性回归

```code
        # Linear models: this import also brings in Ridge and Lasso.
        from sklearn.linear_model import LinearRegression,Ridge,Lasso
        # Plain linear regression (left disabled by the author):
        #regr=LinearRegression()
[/code]

####  2.2岭回归

```code
        # Ridge regression; alpha is passed straight to the estimator.
        ridge_alpha = 1
        regr = Ridge(alpha=ridge_alpha)
[/code]

####  2.3Lasso回归

```code
        # Lasso regression; alpha is passed straight to the estimator.
        lasso_alpha = 0.001
        regr = Lasso(alpha=lasso_alpha)

[/code]

####  2.4逻辑回归

```code

        # Import the estimator here, as every other model snippet does.
        from sklearn.linear_model import LogisticRegression
        # Logistic regression is also a kind of linear regression; here it is
        # registered as one of the classifiers.
        models.append(("LogisticRegression", LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)))

[/code]

####  2.5人工神经网络

```code

        # Artificial neural network: Sequential is the container for the layers.
        from keras.models import Sequential
        # Dense is a fully connected layer; Activation applies an activation function.
        from keras.layers import Dense, Activation
        import matplotlib.pyplot as plt
        from sklearn.metrics import roc_curve, auc, roc_auc_score

        # Create the container.
        mdl = Sequential()
        # First layer: 50 units; input_dim is the number of input features.
        mdl.add(Dense(50, input_dim=len(f_v[0])))
        # Add the activation function.
        mdl.add(Activation("sigmoid"))
        # Output layer: 2 units because there are two label values.
        mdl.add(Dense(2))
        mdl.add(Activation("softmax"))
        # loss is the function being minimised; the optimizer is selected by
        # name (Adam), so no separate optimizer object is needed.
        mdl.compile(loss="mean_squared_error", optimizer="adam")
        # One-hot encode the labels: 1 -> [0, 1], anything else -> [1, 0].
        Y_train_onehot = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
        # epochs = number of training iterations; batch_size = samples drawn
        # for each gradient step.
        mdl.fit(X_train, Y_train_onehot, epochs=1000, batch_size=8999)

        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
        f = plt.figure()
        # One ROC subplot per subset: 0 = train, 1 = validation, 2 = test.
        for i, (X_part, Y_part) in enumerate(xy_lst):
            # predict() returns one score per output unit; predict_classes()
            # would return class labels instead.
            Y_pred = mdl.predict(X_part)
            print(Y_pred)
            # Keep only the second output column, which the one-hot encoding
            # above assigns to label 1.
            Y_pred = Y_pred[:, 1]

            f.add_subplot(1, 3, i + 1)
            fpr, tpr, threshold = roc_curve(Y_part, Y_pred)
            plt.plot(fpr, tpr)
            print("NN", "AUC", auc(fpr, tpr))
            print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
        plt.show()

[/code]

####  2.6GBDT,回归树和提升树

```code

        from sklearn.ensemble import GradientBoostingClassifier
        # GBDT (regression trees / boosted trees). Author's note: a depth of 6
        # is the usual choice; n_estimators is the number of trees.
        gbdt_clf = GradientBoostingClassifier(max_depth=6, n_estimators=100)
        models.append(("GBDT", gbdt_clf))
[/code]

###  模型的评估

```code
    #准确度,召回度, F-score度,为了评价模型的好坏。
        from sklearn.metrics import accuracy_score, recall_score, f1_score
        # The evaluation subsets are the same for every model, so build the
        # list once instead of once per classifier.
        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
        for clf_name, clf in models:
            # Fit on the training subset only.
            clf.fit(X_train, Y_train)
            # i identifies the subset: 0 = train, 1 = validation, 2 = test.
            for i, (X_part, Y_part) in enumerate(xy_lst):
                Y_pred = clf.predict(X_part)
                print(i)
                print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
                print(clf_name, "-REC", recall_score(Y_part, Y_pred))
                print(clf_name, "-F1", f1_score(Y_part, Y_pred))
[/code]

###  完整的程序:

```code
    # encoding: utf-8
    # time: 2018/08/08
    # name: py粉
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler,StandardScaler
    from sklearn.preprocessing import LabelEncoder,OneHotEncoder
    from sklearn.preprocessing import Normalizer
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.decomposition import PCA
    import os
    import pydotplus
    # Append a hard-coded Windows Graphviz directory to PATH
    # (presumably so pydotplus can find the Graphviz executables).
    os.environ["PATH"]+=os.pathsep+"E:/Program/Graphviz/bin/"
    
    
    # Flags accepted by hr_preprocessing:
    # sl:  satisfaction_level     --- False: MinMaxScaler; True: StandardScaler
    # le:  last_evaluation        --- False: MinMaxScaler; True: StandardScaler
    # npr: number_project         --- False: MinMaxScaler; True: StandardScaler
    # amh: average_monthly_hours  --- False: MinMaxScaler; True: StandardScaler
    # tsc: time_spend_company     --- False: MinMaxScaler; True: StandardScaler
    # wa:  Work_accident          --- False: MinMaxScaler; True: StandardScaler
    # pl5: promotion_last_5years  --- False: MinMaxScaler; True: StandardScaler
    # dp:  department             --- False: LabelEncoding; True: OneHotEncoding
    # slr: salary                 --- False: LabelEncoding; True: OneHotEncoding
    
    
    def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=False,slr=False,lower_d=False,ld_n=1,
                         csv_path=r"D:\Python\python'数据分析与建模实现\data\HR.csv"):
        """Load the HR CSV file, clean it and return ``(features, label)``.

        Args:
            sl, le, npr, amh, tsc, wa, pl5: scaling choice for
                satisfaction_level, last_evaluation, number_project,
                average_monthly_hours, time_spend_company, Work_accident and
                promotion_last_5years respectively. False uses MinMaxScaler,
                True uses StandardScaler.
            dp: encoding of "department". False label-encodes the column and
                then min-max scales it; True one-hot encodes it with
                ``pd.get_dummies``.
            slr: encoding of "salary". False maps the value through
                ``map_salary`` and then min-max scales it; True one-hot
                encodes it.
            lower_d: when True, reduce the features with PCA before returning.
            ld_n: number of PCA components used when ``lower_d`` is True.
            csv_path: location of the CSV file. The default is the path that
                used to be hard-coded here.

        Returns:
            A tuple ``(features, label)``. ``label`` is the "left" column.
            ``features`` is a DataFrame, or the array returned by
            ``PCA.fit_transform`` when ``lower_d`` is True.
        """
        # Pass an open file object to pandas, as the original code did, but
        # close it as soon as parsing is done.
        with open(csv_path) as f:
            df = pd.read_csv(f)

        # 1. Clean the data: drop rows with missing key values, then drop
        #    rows with satisfaction_level > 1 or salary == "nme".
        df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
        # One combined mask instead of chained indexing, which makes pandas
        # reindex the second mask and emit a warning.
        keep = (df["satisfaction_level"] <= 1) & (df["salary"] != "nme")
        # Copy so the column assignments below write to an independent frame.
        df = df[keep].copy()

        # 2. Separate the label from the features.
        label = df["left"]
        df = df.drop("left", axis=1)

        # 3./4. Scale the numeric columns.
        numeric_flags = [sl, le, npr, amh, tsc, wa, pl5]
        numeric_columns = ["satisfaction_level", "last_evaluation", "number_project",
                           "average_monthly_hours", "time_spend_company", "Work_accident",
                           "promotion_last_5years"]
        for use_standard, column in zip(numeric_flags, numeric_columns):
            scaler = StandardScaler() if use_standard else MinMaxScaler()
            # The scalers expect a 2-D column vector; flatten the result back.
            df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

        # Encode the categorical columns.
        for one_hot, column in zip([slr, dp], ["salary", "department"]):
            if one_hot:
                # pandas provides one-hot encoding through get_dummies.
                df = pd.get_dummies(df, columns=[column])
                continue
            if column == "salary":
                df[column] = [map_salary(s) for s in df["salary"].values]
            else:
                df[column] = LabelEncoder().fit_transform(df[column])
            df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

        if lower_d:
            return PCA(n_components=ld_n).fit_transform(df.values), label
        return df, label
    # Integer codes for the values of the "salary" column.
    d = {"low": 0, "medium": 1, "high": 2}
    def map_salary(s):
        """Return the integer code for salary value *s*, or 0 if it is not in ``d``."""
        try:
            return d[s]
        except KeyError:
            return 0
    def hr_modeling(features,label):
        """Split the data 6:2:2, then evaluate a neural network and a set of classifiers.

        Args:
            features: DataFrame of model inputs; only ``features.values`` is used.
            label: Series of labels; only ``label.values`` is used. The
                network treats the value 1 as one class and everything else
                as the other.

        Returns:
            None. ROC/AUC figures for the Keras network and accuracy, recall
            and F1 for every scikit-learn model are printed for the training,
            validation and test subsets.

        The classifier evaluation used to sit behind a bare ``return`` and
        never ran; it now runs after the network's ROC plot is shown.
        """
        from sklearn.model_selection import train_test_split

        f_v = features.values
        l_v = label.values
        # Hold out 20% for validation, then 25% of the remaining 80%
        # (20% overall) for testing.
        X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
        X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

        _nn_roc_report(xy_lst)
        _evaluate_classifiers(xy_lst)

    def _nn_roc_report(xy_lst):
        """Fit a small Keras network on the first subset and plot a ROC curve per subset."""
        # Sequential is the container for the layers; Dense is a fully
        # connected layer; Activation applies an activation function.
        from keras.models import Sequential
        from keras.layers import Dense, Activation
        import matplotlib.pyplot as plt
        from sklearn.metrics import roc_curve, auc, roc_auc_score

        X_train, Y_train = xy_lst[0]

        mdl = Sequential()
        # First layer: 50 units; input_dim is the number of input features.
        mdl.add(Dense(50, input_dim=X_train.shape[1]))
        mdl.add(Activation("sigmoid"))
        # Output layer: 2 units because there are two label values.
        mdl.add(Dense(2))
        mdl.add(Activation("softmax"))
        # loss is the function being minimised; the optimizer is selected by
        # name (Adam), so no separate optimizer object is needed.
        mdl.compile(loss="mean_squared_error", optimizer="adam")
        # One-hot encode the labels: 1 -> [0, 1], anything else -> [1, 0].
        Y_train_onehot = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
        # epochs = number of training iterations; batch_size = samples drawn
        # for each gradient step.
        # NOTE(review): 8999 looks like the training-set size of the author's
        # data set - confirm before reusing on other data.
        mdl.fit(X_train, Y_train_onehot, epochs=1000, batch_size=8999)

        f = plt.figure()
        # One ROC subplot per subset: 0 = train, 1 = validation, 2 = test.
        for i, (X_part, Y_part) in enumerate(xy_lst):
            # predict() returns one score per output unit; predict_classes()
            # would return class labels instead.
            Y_pred = mdl.predict(X_part)
            print(Y_pred)
            # Keep only the second output column, which the one-hot encoding
            # above assigns to label 1.
            Y_pred = Y_pred[:, 1]

            f.add_subplot(1, 3, i + 1)
            fpr, tpr, threshold = roc_curve(Y_part, Y_pred)
            plt.plot(fpr, tpr)
            print("NN", "AUC", auc(fpr, tpr))
            print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
        plt.show()

    def _evaluate_classifiers(xy_lst):
        """Fit each scikit-learn classifier on the first subset and print ACC/REC/F1 per subset."""
        from sklearn.metrics import accuracy_score, recall_score, f1_score
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.naive_bayes import GaussianNB, BernoulliNB
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.svm import SVC
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.linear_model import LogisticRegression

        X_train, Y_train = xy_lst[0]

        models = [
            ("KNN", KNeighborsClassifier(n_neighbors=3)),
            # Naive Bayes
            ("GaussianNB", GaussianNB()),
            ("BernoulliNB", BernoulliNB()),
            # Decision trees: default criterion (labelled Gini) and entropy.
            # Author's note: min_impurity_split=0.1 can be used for pruning.
            ("DecisionTreeGini", DecisionTreeClassifier()),
            ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
            # Support vector machine. Author's note: C controls how
            # precisely the training data is fitted.
            ("SVM Classifier", SVC(C=1000)),
            # Random forest with default settings, then an 11-tree variant.
            ("OriginalRandomForest", RandomForestClassifier()),
            ("RandomForest", RandomForestClassifier(n_estimators=11, max_features=None)),
            # AdaBoost. Other options noted by the author:
            # base_estimator=SVC(), algorithm="SAMME".
            ("Adaboost", AdaBoostClassifier(n_estimators=100)),
            # Logistic regression is also a kind of linear regression.
            ("LogisticRegression", LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)),
            # GBDT. Author's note: a depth of 6 is the usual choice;
            # n_estimators is the number of trees.
            ("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100)),
        ]
        for clf_name, clf in models:
            clf.fit(X_train, Y_train)
            # i identifies the subset: 0 = train, 1 = validation, 2 = test.
            for i, (X_part, Y_part) in enumerate(xy_lst):
                Y_pred = clf.predict(X_part)
                print(i)
                print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
                print(clf_name, "-REC", recall_score(Y_part, Y_pred))
                print(clf_name, "-F1", f1_score(Y_part, Y_pred))
            # NOTE: the author's disabled tree-drawing code passed clf to
            # sklearn.tree.export_graphviz (feature_names=features.columns.values,
            # class_names=["NL", "L"]), rendered it with
            # pydotplus.graph_from_dot_data(...) and wrote "dt_tree_2.pdf".
    
    def regr_test(features,label):
        """Fit a Ridge regressor on the given data and print its metrics.

        The model is fitted and scored on the same rows, so the printed
        MSE, MAE and R2 are in-sample values.

        Args:
            features: DataFrame of predictors; ``features.values`` is used.
            label: Series with the regression target; ``label.values`` is used.
        """
        print("X",features)
        print("Y",label)
        from sklearn.linear_model import LinearRegression, Ridge, Lasso
        from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

        X, y_true = features.values, label.values
        # Alternatives left disabled by the author:
        # LinearRegression() and Lasso(alpha=0.001).
        regr = Ridge(alpha=1)
        regr.fit(X, y_true)
        y_fit = regr.predict(X)
        print("Coef:", regr.coef_)
        for tag, metric in (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R2:", r2_score),
        ):
            print(tag, metric(y_true, y_fit))
    
    def main():
        """Preprocess the HR data and run the regression demo."""
        # Data cleaning and feature processing with the default options.
        features, label = hr_preprocessing()
        # Regression demo: predict last_evaluation from number_project and
        # average_monthly_hours.
        predictors = features[["number_project", "average_monthly_hours"]]
        target = features["last_evaluation"]
        regr_test(predictors, target)
        # Classification and ensemble models (left disabled by the author):
        #hr_modeling(features, label)

    if __name__ == "__main__":
        main()

在这里插入图片描述

标签:数据分析,code,python,pred,模型,part,train,import,sklearn
来源: https://blog.csdn.net/wx1871428/article/details/118710030

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有