
    boysoft2002's Column: Nanjing University "Exploring the Mysteries of Data" Courseware Example Code, Notes 13

    Date: 2021-06-11 21:17

    Chp7-3
    December 23, 2019

    In [21]: import pandas as pd
    import numpy as np
    from scipy import stats
    from matplotlib import pyplot as plt
    # load the German credit dataset (raw string avoids backslash escapes in the Windows path)
    my_data = pd.read_csv(r"C:\Python\Scripts\my_data\german_credit_data_dataset.csv")  # ,dtype=str
    print(my_data.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1000 entries, 0 to 999
    Data columns (total 21 columns):
    checking_account_status 1000 non-null object
    duration 1000 non-null int64
    credit_history 1000 non-null object
    purpose 1000 non-null object
    credit_amount 1000 non-null float64
    savings 1000 non-null object
    present_employment 1000 non-null object
    installment_rate 1000 non-null float64
    personal 1000 non-null object
    other_debtors 1000 non-null object
    present_residence 1000 non-null float64
    property 1000 non-null object
    age 1000 non-null float64
    other_installment_plans 1000 non-null object
    housing 1000 non-null object
    existing_credits 1000 non-null float64
    job 1000 non-null object
    dependents 1000 non-null int64
    telephone 1000 non-null object
    foreign_worker 1000 non-null object
    customer_type 1000 non-null int64
    dtypes: float64(5), int64(3), object(13)
    memory usage: 164.1+ KB
    None
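
    A quick check of the label distribution is useful context here (not part of the original notes): the German credit data is known to be imbalanced, with roughly 70% "good" customers (customer_type = 1) and 30% "bad" ones (customer_type = 2). A minimal sketch:

    # share of each class in the target column; a majority-class guess
    # would already reach about 0.70 accuracy on this data
    print(my_data['customer_type'].value_counts(normalize=True))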
    
    In [52]: from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    # one-hot encode the two categorical features and keep the label plus credit_amount
    feature_col = ['checking_account_status', 'personal']
    X = my_data[['customer_type', 'credit_amount']]
    for n, my_str in enumerate(feature_col):
        my_dummy = pd.get_dummies(my_data[[my_str]], prefix=my_str)
        X = pd.concat([X, my_dummy], axis=1)
    XX_feature = ['credit_amount', 'checking_account_status_A14', 'personal_A91',
                  'personal_A92', 'personal_A93', 'personal_A94']
    XX = X[XX_feature]
    Y = X['customer_type']
    X_train, X_test, Y_train, Y_test = train_test_split(XX, Y, test_size=0.2, random_state=0)
    my_tree = DecisionTreeClassifier(max_depth=3)   # shallow tree to keep the model interpretable
    my_tree.fit(X_train, Y_train)
    print('Predicted classes: ', my_tree.predict(X_test), '\n')
    print('Mean accuracy: ', my_tree.score(X_test, Y_test))
    Predicted classes:  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    1 1 1 1 1 1 2 1 1 1 1 1 1 1 1]
    Mean accuracy:  0.71
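
    The reported accuracy of 0.71 is only slightly above the majority-class baseline, and the predictions above are almost all class 1. A confusion matrix makes this easier to see; a minimal sketch using sklearn.metrics (not in the original notes):

    from sklearn.metrics import confusion_matrix
    # rows = true class (1 = good, 2 = bad), columns = predicted class
    print(confusion_matrix(Y_test, my_tree.predict(X_test)))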
    
    In [54]: pd.DataFrame({'feature':XX.columns,'importance':my_tree.feature_importances_})
    Out[54]:                        feature  importance
    0                  credit_amount    0.314532
    1    checking_account_status_A14    0.671787
    2                   personal_A91    0.013680
    3                   personal_A92    0.000000
    4                   personal_A93    0.000000
    5                   personal_A94    0.000000
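
    feature_importances_ gives each feature's normalized total impurity reduction in the fitted tree (the values sum to 1); here checking_account_status_A14 (in the standard coding, "no checking account") dominates. To list the features in a more readable order, the same DataFrame can be sorted (a small addition, not in the original notes):

    imp = pd.DataFrame({'feature': XX.columns, 'importance': my_tree.feature_importances_})
    # most informative features first
    print(imp.sort_values('importance', ascending=False))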
    
    In [55]: from sklearn import tree
    import matplotlib.pyplot as plt
    plt.figure(figsize=(18, 12))
    # visualize the fitted tree and save the figure to disk
    tree.plot_tree(my_tree, fontsize=12, feature_names=XX.columns, class_names=['Good', 'Bad'])
    plt.savefig('my_tree')
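
    Besides the saved figure, scikit-learn can also print the fitted tree as plain text, which is handy when no plotting backend is available; a sketch using tree.export_text (not in the original notes):

    # text rendering of the same decision tree (available since scikit-learn 0.21)
    print(tree.export_text(my_tree, feature_names=list(XX.columns)))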
