boysoft2002's column: Nanjing University "Exploring the Mysteries of Data" (《探索数据的奥秘》) courseware sample code notes 12

Posted: 2021-06-11 21:17

    Chp7-2
December 23, 2019

In [9]: import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import datasets
iris=datasets.load_iris()          # classic iris dataset: 150 samples, 4 features, 3 species
print(type(iris.data))
print(iris.data)
#print(type(iris.target))
#print(iris.target)
<class 'numpy.ndarray'>
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 ...
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
(printout of the full 150 × 4 feature array abridged here)
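The raw ndarray printout is hard to scan; a minimal sketch (my addition, reusing the pandas import above) wraps it in a DataFrame so the four columns carry their feature names:

df=pd.DataFrame(iris.data,columns=iris.feature_names)
df['target']=iris.target              # 0=setosa, 1=versicolor, 2=virginica
print(df.describe())                  # per-feature count / mean / std / min / max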
In [11]: import pandas as pd
import numpy as np
from sklearn import datasets
from scipy import stats
import matplotlib.pyplot as plt
iris=datasets.load_iris()
#print((iris))
# normal Q-Q plot of each standardized feature (rows) for each species (columns)
plt.figure(figsize=(12,15))
for n in range(4):
    print(n)
    for m in range(3):
        x=(iris.data[m*50:m*50+50,n]-iris.data[m*50:m*50+50,n].mean())/iris.data[m*50:m*50+50,n].std()
        plt.subplot(4,3,n*3+m+1)
        stats.probplot(x,dist='norm',plot=plt)
        plt.text(-2,2,iris.feature_names[n])
        if n==0:
            plt.title(iris.target_names[m])
        else:
            plt.title('')
        plt.xlim([-2.5,2.5])
        plt.ylim([-2.5,2.5])
        plt.plot([-2.5,2.5],[-2.5,2.5],c='g')
plt.savefig('chap72.png')
    
    0 
    1 
    2 
    3
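The Q-Q plots saved to chap72.png give a visual normality check for each standardized feature within each species; a complementary numeric check (my addition, not in the courseware) could use the Shapiro-Wilk test from the already-imported scipy.stats:

# sketch: Shapiro-Wilk normality test per feature and species
for n in range(4):
    for m in range(3):
        stat,p=stats.shapiro(iris.data[m*50:m*50+50,n])
        print(iris.target_names[m],iris.feature_names[n],round(p,3))
# small p-values (e.g. below 0.05) would suggest a departure from normality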

    In [40]: my_data=iris.data[:,:2]
    print((my_data).shape)
    (150, 2)
    
In [41]: from sklearn.model_selection import train_test_split
#X_train,X_test,Y_train,Y_test=train_test_split(iris.data,iris.target,test_size=0.2,random_state=0)
X_train,X_test,Y_train,Y_test=train_test_split(my_data,iris.target,test_size=0.2,random_state=0)
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
clf.fit(X_train,Y_train)               # train a Gaussian naive Bayes classifier on the two features
y_pred=clf.predict(X_test)
# use a list (not a set) for columns so the labels line up with the data order
Y=pd.DataFrame(np.transpose([Y_test,y_pred]),columns=['true_type','predict_type'])
Y.head(30)
    
Out[41]: true_type predict_type
    0 2 1
    1 1 1
    2 0 0
    3 2 2
    4 0 0
    5 2 2
    6 0 0
    7 1 2
    8 1 2
    9 1 1
    10 2 1
    11 1 2
    12 1 1
    13 1 2
    14 1 1
    15 0 0
    16 1 1
    17 1 1
    18 0 0
    19 0 0
    20 2 1
    21 1 1
    22 0 0
    23 0 0
    24 2 1
    25 0 0
    26 0 0
    27 1 1
    28 1 1
    29 0 0
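A single accuracy figure summarizes the table above; a minimal sketch (accuracy_score is my addition, not shown in the original cell), which from the confusion matrix in the next cell should come out to 22/30 ≈ 0.73:

from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test,y_pred))   # fraction of the 30 test samples predicted correctly
# equivalently: clf.score(X_test,Y_test)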
    
In [42]: from sklearn.metrics import confusion_matrix
# note: sklearn's convention is confusion_matrix(y_true, y_pred); the arguments are swapped here,
# which transposes the matrix (harmless in this case because the result happens to be symmetric)
print(confusion_matrix(y_pred,Y_test))
[[11 0 0]
 [ 0 9 4]
 [ 0 4 2]]
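For readability, a hedged sketch that calls confusion_matrix in the conventional (y_true, y_pred) order and labels rows and columns with the species names (this wrapper is my addition):

cm=confusion_matrix(Y_test,y_pred)     # rows = true class, columns = predicted class
print(pd.DataFrame(cm,index=iris.target_names,columns=iris.target_names))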
    In [43]: from sklearn.metrics import classification_report
    print(classification_report(y_pred,Y_test))
    precision recall f1-score support
    0 1.00 1.00 1.00 11
    1 0.69 0.69 0.69 13
    2 0.33 0.33 0.33 6
    avg / total 0.73 0.73 0.73 30
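These per-class scores follow directly from the confusion matrix above: for class 2, only 2 of the 6 samples predicted as class 2 are truly class 2, and only 2 of the 6 true class-2 samples are recovered, so precision = recall = 2/6 ≈ 0.33; for class 1 the corresponding counts are 9 of 13, giving 0.69.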
    
In [44]: from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
my_auc=[]
for n in range(4):
    #fpr,tpr,th=roc_curve(iris.target[:100],iris.data[:100,n])
    my_auc.append(roc_auc_score(iris.target[:100],iris.data[:100,n]))
print(my_auc)
#print(iris.target[:100])
#print(iris.data[:100,:])
[0.9325999999999999, 0.07780000000000002, 1.0, 1.0]
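The commented-out roc_curve line above can be completed to actually draw the curves (a sketch on the same first-100-sample slice; the 0.078 AUC for sepal width simply means that feature separates the two classes in the opposite direction, setosa having the wider sepals):

plt.figure()
for n in range(4):
    fpr,tpr,th=roc_curve(iris.target[:100],iris.data[:100,n])
    plt.plot(fpr,tpr,label=iris.feature_names[n])
plt.plot([0,1],[0,1],'k--')            # chance diagonal
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend()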
    
    In [56]: plt.plot(np.ones([50,1]),iris.data[:50,0],'or')
    plt.plot(np.ones([50,1])+0.2,iris.data[50:100,0],'*g')
    plt.plot(np.ones([50,1])+1,iris.data[:50,1],'or')
    plt.plot(np.ones([50,1])+1.2,iris.data[50:100,1],'*g')
    plt.plot(np.ones([50,1])+2,iris.data[:50,2],'or')
    plt.plot(np.ones([50,1])+2.2,iris.data[50:100,2],'*g')
    plt.plot(np.ones([50,1])+3,iris.data[:50,3],'or')
    plt.plot(np.ones([50,1])+3.2,iris.data[50:100,3],'*g')
    plt.xticks([1,2,3,4],iris.feature_names)
    plt.legend(iris.target_names[:2])
    Out[56]: <matplotlib.legend.Legend at 0x272c66ebac8>
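The same setosa-versus-versicolor comparison can be drawn more compactly with box plots (a variation of mine, not in the courseware):

plt.figure()
plt.boxplot([iris.data[:50,n] for n in range(4)],positions=[1,2,3,4])
plt.boxplot([iris.data[50:100,n] for n in range(4)],positions=[1.4,2.4,3.4,4.4])
plt.xticks([1.2,2.2,3.2,4.2],iris.feature_names,rotation=20)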

    In [21]: plt.figure(figsize=(5,12))
    plt.subplot(3,1,1)
    plt.scatter(iris.data[:50,0],iris.data[:50,1],c='r',marker='o')
    plt.scatter(iris.data[50:100,0],iris.data[50:100,1],c='b',marker='*')
    plt.subplot(3,1,2)
    plt.scatter(iris.data[:50,0],iris.data[:50,2],c='r',marker='o')
    plt.scatter(iris.data[50:100,0],iris.data[50:100,2],c='b',marker='*')
    plt.subplot(3,1,3)
    plt.scatter(iris.data[:50,0],iris.data[:50,3],c='r',marker='o')
    plt.scatter(iris.data[50:100,0],iris.data[50:100,3],c='b',marker='*')
    Out[21]: <matplotlib.collections.PathCollection at 0x272c83ff9e8>
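All pairwise feature combinations of the first two species can be inspected at once with a scatter matrix (a sketch using pandas.plotting.scatter_matrix; the filename is hypothetical):

df2=pd.DataFrame(iris.data[:100],columns=iris.feature_names)
pd.plotting.scatter_matrix(df2,c=iris.target[:100],figsize=(8,8),diagonal='hist')
plt.savefig('chap72_pairs.png')        # hypothetical output file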
