当前位置 博文首页 > boysoft2002的专栏:南大《探索数据的奥秘》课件示例代码笔记18

    boysoft2002的专栏:南大《探索数据的奥秘》课件示例代码笔记18

    作者:boysoft2002 时间:2021-06-11 21:16

    Chp8-4
    2019 年 12 月 23 日

    In [21]: import numpy as np
    import pandas as pd
    data1=np.random.rand(1000) #[0,1] 均匀分布的随机数
    data2=np.random.rand(1000)
    data3=np.random.rand(1000)
    data4=np.random.rand(1000)
    data5=np.random.rand(1000)
    pd.DataFrame(data1).hist(bins=10)
    print('data1 的 1000 个数中,有',(data1>0.5).sum(),'个数据是大于 0.5 的')
    print('data1 的 1000 个数中,有',(data1>0.3).sum(),'个数据是大于 0.3 的')
    data1 的 1000 个数中,有 499 个数据是大于 0.5 的
    data1 的 1000 个数中,有 713 个数据是大于 0.3 的

    In [22]: # 大于 0.3 就预测 1,否则预测 0, 假设真实值全 1,则预测的 accuracy=0.7
    model1=np.where(data1>0.3,1,0)
    model2=np.where(data2>0.3,1,0)
    model3=np.where(data3>0.3,1,0)
    model4=np.where(data4>0.3,1,0)
    model5=np.where(data5>0.3,1,0)
    # 均值数学上相当于预测 1 占所有样本的比例,相当于预测的 accuracy
    print('第一个模型的 accuracy 是: ',model1.mean())
    print('第二个模型的 accuracy 是: ',model2.mean())
    print('第三个模型的 accuracy 是: ',model3.mean())
    print('第四个模型的 accuracy 是: ',model4.mean())
    print('第五个模型的 accuracy 是: ',model5.mean())
    第一个模型的 accuracy 是: 0.713
    第二个模型的 accuracy 是: 0.721
    第三个模型的 accuracy 是: 0.687
    第四个模型的 accuracy 是: 0.671
    第五个模型的 accuracy 是: 0.72
    
    In [11]: # 相当于 5 个预测模型累加平均
    ensemble_preds=np.round((model1+model2+model3+model4+model5)/5.0).astype(int)
    print('集成模型的 accuracy 是: ',ensemble_preds.mean())
    集成模型的 accuracy 是: 0.839
    
    In [23]: # 大于 0.7 就预测 1,否则预测 0, 假设真实值全 1,则预测的 accuracy=0.3
    model1=np.where(data1>0.7,1,0)
    model2=np.where(data2>0.7,1,0)
    model3=np.where(data3>0.7,1,0)
    model4=np.where(data4>0.7,1,0)
    model5=np.where(data5>0.7,1,0)
    # 均值数学上相当于预测 1 占所有样本的比例,相当于预测的 accuracy
    print('第一个模型的 accuracy 是: ',model1.mean())
    print('第二个模型的 accuracy 是: ',model2.mean())
    print('第三个模型的 accuracy 是: ',model3.mean())
    print('第四个模型的 accuracy 是: ',model4.mean())
    print('第五个模型的 accuracy 是: ',model5.mean())
    # 相当于 5 个预测模型累加平均
    ensemble_preds=np.round((model1+model2+model3+model4+model5)/5.0).astype(int)
    print('集成模型的 accuracy 是: ',ensemble_preds.mean())
    第一个模型的 accuracy 是: 0.305
    第二个模型的 accuracy 是: 0.319
    第三个模型的 accuracy 是: 0.285
    第四个模型的 accuracy 是: 0.291
    第五个模型的 accuracy 是: 0.319
    集成模型的 accuracy 是: 0.178
    
    In [24]: # 五个模型分别以 0.7、0.3、0.6、0.2、0.5 为阈值,大于阈值就预测 1,否则预测 0, 假设真实值全 1,则各模型的 accuracy 约为 1 减去各自的阈值
    model1=np.where(data1>0.7,1,0)
    model2=np.where(data2>0.3,1,0)
    model3=np.where(data3>0.6,1,0)
    model4=np.where(data4>0.2,1,0)
    model5=np.where(data5>0.5,1,0)
    # 均值数学上相当于预测 1 占所有样本的比例,相当于预测的 accuracy
    print('第一个模型的 accuracy 是: ',model1.mean())
    print('第二个模型的 accuracy 是: ',model2.mean())
    print('第三个模型的 accuracy 是: ',model3.mean())
    print('第四个模型的 accuracy 是: ',model4.mean())
    print('第五个模型的 accuracy 是: ',model5.mean())
    # 相当于 5 个预测模型累加平均
    ensemble_preds=np.round((model1+model2+model3+model4+model5)/5.0).astype(int)
    print('集成模型的 accuracy 是: ',ensemble_preds.mean())
    第一个模型的 accuracy 是: 0.305
    第二个模型的 accuracy 是: 0.721
    第三个模型的 accuracy 是: 0.383
    第四个模型的 accuracy 是: 0.778
    第五个模型的 accuracy 是: 0.512
    集成模型的 accuracy 是: 0.571
    
    In [1]: import pandas as pd
    import numpy as np
    from scipy import stats
    from matplotlib import pyplot as plt
    my_data = pd.read_csv("C:\Python\Scripts\my_data\german_credit_data_dataset.csv"
    )#,dtype=str)
    hah=my_data[['customer_type']]-1
    print(hah.sum())
    customer_type 300
    dtype: int64
    
    In [2]: #from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestClassifier
    feature_col=my_data.columns
    X=my_data[['duration']] #
    for n,my_str in enumerate(feature_col):
        if (my_str!='customer_type') & (my_str != 'duration'):
            if my_data[[my_str]].dtypes[0]!=object:
                X=pd.concat([X,my_data[[my_str]]],axis=1)
    for n,my_str in enumerate(feature_col):
        if my_data[[my_str]].dtypes[0] == object:
            my_dummy=pd.get_dummies(my_data[[my_str]],prefix=my_str)
            X=pd.concat([X,my_dummy],axis=1)
    print(X.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1000 entries, 0 to 999
    Data columns (total 61 columns):
    duration 1000 non-null int64
    credit_amount 1000 non-null float64
    installment_rate 1000 non-null float64
    present_residence 1000 non-null float64
    age 1000 non-null float64
    existing_credits 1000 non-null float64
    dependents 1000 non-null int64
    checking_account_status_A11 1000 non-null uint8
    checking_account_status_A12 1000 non-null uint8
    checking_account_status_A13 1000 non-null uint8
    checking_account_status_A14 1000 non-null uint8
    credit_history_A30 1000 non-null uint8
    credit_history_A31 1000 non-null uint8
    credit_history_A32 1000 non-null uint8
    credit_history_A33 1000 non-null uint8
    credit_history_A34 1000 non-null uint8
    purpose_A40 1000 non-null uint8
    purpose_A41 1000 non-null uint8
    purpose_A410 1000 non-null uint8
    purpose_A42 1000 non-null uint8
    purpose_A43 1000 non-null uint8
    purpose_A44 1000 non-null uint8
    purpose_A45 1000 non-null uint8
    purpose_A46 1000 non-null uint8
    purpose_A48 1000 non-null uint8
    purpose_A49 1000 non-null uint8
    savings_A61 1000 non-null uint8
    savings_A62 1000 non-null uint8
    savings_A63 1000 non-null uint8
    savings_A64 1000 non-null uint8
    savings_A65 1000 non-null uint8
    present_employment_A71 1000 non-null uint8
    present_employment_A72 1000 non-null uint8
    present_employment_A73 1000 non-null uint8
    present_employment_A74 1000 non-null uint8
    present_employment_A75 1000 non-null uint8
    personal_A91 1000 non-null uint8
    personal_A92 1000 non-null uint8
    personal_A93 1000 non-null uint8
    personal_A94 1000 non-null uint8
    other_debtors_A101 1000 non-null uint8
    other_debtors_A102 1000 non-null uint8
    other_debtors_A103 1000 non-null uint8
    property_A121 1000 non-null uint8
    property_A122 1000 non-null uint8
    property_A123 1000 non-null uint8
    property_A124 1000 non-null uint8
    other_installment_plans_A141 1000 non-null uint8
    other_installment_plans_A142 1000 non-null uint8
    other_installment_plans_A143 1000 non-null uint8
    housing_A151 1000 non-null uint8
    housing_A152 1000 non-null uint8
    housing_A153 1000 non-null uint8
    job_A171 1000 non-null uint8
    job_A172 1000 non-null uint8
    job_A173 1000 non-null uint8
    job_A174 1000 non-null uint8
    telephone_A191 1000 non-null uint8
    telephone_A192 1000 non-null uint8
    foreign_worker_A201 1000 non-null uint8
    foreign_worker_A202 1000 non-null uint8
    dtypes: float64(5), int64(2), uint8(54)
    memory usage: 107.5 KB
    None
    
    In [3]: estimator_range=range(10,400,10)
    my_scores=[]
    for estimator in estimator_range:
        my_tree=RandomForestClassifier(n_estimators=estimator)
        accuracy_scores=cross_val_score(my_tree,X,my_data['customer_type'],
                                        cv=5,scoring='roc_auc')
        my_scores.append(accuracy_scores.mean())
    In [4]: plt.plot(estimator_range,my_scores)
    plt.xlabel('the number of trees')
    plt.ylabel('ROC_AUC')
    Out[4]: Text(0,0.5,'ROC_AUC')

    In [5]: my_RF=RandomForestClassifier(n_estimators=150)
    my_RF.fit(X,my_data['customer_type'])
    pd.DataFrame({'feature':X.columns,
    'importance':my_RF.feature_importances_}).sort_values('importance',
    ascending=False)
    Out[5]: feature importance
    1 credit_amount 0.102241
    4 age 0.077722
    0 duration 0.077652
    10 checking_account_status_A14 0.047908
    7 checking_account_status_A11 0.039347
    3 present_residence 0.034465
    2 installment_rate 0.033480
    15 credit_history_A34 0.021523
    26 savings_A61 0.019369
    5 existing_credits 0.017395
    8 checking_account_status_A12 0.017350
    43 property_A121 0.017015
    49 other_installment_plans_A143 0.016411
    20 purpose_A43 0.016216
    16 purpose_A40 0.016094
    37 personal_A92 0.015763
    55 job_A173 0.015198
    33 present_employment_A73 0.015030
    51 housing_A152 0.014993
    13 credit_history_A32 0.014474
    32 present_employment_A72 0.014454
    30 savings_A65 0.014404
    38 personal_A93 0.014346
    19 purpose_A42 0.014143
    56 job_A174 0.014014
    45 property_A123 0.013968
    44 property_A122 0.013528
    47 other_installment_plans_A141 0.013023
    57 telephone_A191 0.012869
    35 present_employment_A75 0.012681
    .. ... ...
    12 credit_history_A31 0.011835
    6 dependents 0.011585
    11 credit_history_A30 0.011464
    50 housing_A151 0.010970
    46 property_A124 0.010738
    54 job_A172 0.010724
    34 present_employment_A74 0.010696
    17 purpose_A41 0.009589
    25 purpose_A49 0.009458
    40 other_debtors_A101 0.009290
    31 present_employment_A71 0.009159
    14 credit_history_A33 0.008633
    27 savings_A62 0.008608
    23 purpose_A46 0.008341
    39 personal_A94 0.008258
    9 checking_account_status_A13 0.008238
    48 other_installment_plans_A142 0.007623
    42 other_debtors_A103 0.007613
    36 personal_A91 0.007098
    41 other_debtors_A102 0.006547
    52 housing_A153 0.006518
    28 savings_A63 0.005626
    29 savings_A64 0.005101
    22 purpose_A45 0.004062
    60 foreign_worker_A202 0.003173
    59 foreign_worker_A201 0.002837
    53 job_A171 0.002301
    21 purpose_A44 0.001803
    18 purpose_A410 0.001542
    24 purpose_A48 0.001007
    [61 rows x 2 columns]
    
    In [6]: print(my_RF)
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
    max_depth=None, max_features='auto', max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=150,
    n_jobs=None, oob_score=False, random_state=None,
    verbose=0, warm_start=False)

    ?

    下一篇:没有了