当前位置 博文首页 > boysoft2002的专栏:南大《探索数据的奥秘》课件示例代码笔记16

    boysoft2002的专栏:南大《探索数据的奥秘》课件示例代码笔记16

    作者:boysoft2002 时间:2021-06-11 21:16

    Chp8-2
    2019 年 12 月 23 日

    In [3]: from sklearn.model_selection import train_test_split
    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    # Load the iris measurements.  header=None tells pandas the file has no
    # header row, so the column names are supplied explicitly.  The path is a
    # raw string so the Windows backslashes are never treated as escapes.
    df = pd.read_csv(
        r'C:\Python\Scripts\my_data\iris.csv',
        header=None,
        names=['sepal_length', 'sepal_width', 'petal_length',
               'petal_width', 'target'],
    )
    # Keep only the two sepal columns of the first 50 rows.
    my_data = df[['sepal_length', 'sepal_width']].iloc[:50]
    def rmse(x, y, coefs):  # note: this is the syntax for a user-defined function
        """Return the root-mean-square error of a polynomial fit.

        Args:
            x: Sample positions at which the polynomial is evaluated.
            y: Observed values, one per entry of ``x``.
            coefs: Polynomial coefficients in the order ``np.polyval``
                expects (the same order ``np.polyfit`` returns).

        Returns:
            ``sqrt(mean((y - polyval(coefs, x)) ** 2))``.
        """
        yfit = np.polyval(coefs, x)
        # Return the value directly instead of binding a local called
        # ``rmse``, which would shadow this function inside its own body.
        return np.sqrt(np.mean((y - yfit) ** 2))
    # Split the samples in half: one half to fit on, the other to score on.
    xtrain, xtest, ytrain, ytest = train_test_split(
        my_data['sepal_length'], my_data['sepal_width'], test_size=0.5)
    train_err = []
    validation_err = []
    degrees = range(1, 8)
    for d in degrees:
        # Fit a degree-d polynomial on the training half only, then record
        # its RMSE on both halves.
        p = np.polyfit(xtrain, ytrain, d)
        train_err.append(rmse(xtrain, ytrain, p))
        validation_err.append(rmse(xtest, ytest, p))
    # Plot both error curves against the polynomial degree.
    fig, ax = plt.subplots()
    ax.plot(degrees, validation_err, lw=2, label='testing error')
    ax.plot(degrees, train_err, lw=2, label='training error')
    ax.legend(loc=0)
    ax.set_xlabel('degree of polynomial')
    ax.set_ylabel('RMSE')
    Out[3]: Text(0,0.5,'RMSE')

    In [54]: from sklearn.model_selection import KFold
    # Use every row this time, restricted to the two sepal columns.
    my_data = df[['sepal_length', 'sepal_width']]
    nfolds = 3
    fig, axes = plt.subplots(1, nfolds, figsize=(14, 4))
    kf = KFold(n_splits=nfolds)
    # One subplot per fold: training rows in red, validation rows in blue.
    # Both row sets are taken from my_data (the original mixed my_data and df
    # for the same positional indices).
    for ax, (training, validation) in zip(axes, kf.split(my_data)):
        train_rows = my_data.iloc[training]
        ax.plot(train_rows['sepal_length'], train_rows['sepal_width'], 'ro')
        validation_rows = my_data.iloc[validation]
        ax.plot(validation_rows['sepal_length'],
                validation_rows['sepal_width'], 'bo')
    plt.tight_layout()
    

    In [61]: my_class=[]
    # Append one label per row: the first 50 rows get class 1, the next 50
    # class 2, and the last 50 class 3.
    for n in range(150):
        if n < 50:
            my_class.append(1)
        elif n < 100:
            my_class.append(2)
        else:
            my_class.append(3)
    print(my_class)
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
    
    In [65]: from sklearn.model_selection import cross_val_score,train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    knn1 = KNeighborsClassifier(n_neighbors=1)
    knn2 = KNeighborsClassifier(n_neighbors=1)
    # Select the feature columns once; both evaluations use the same inputs.
    features = my_data[['sepal_length', 'sepal_width']]
    knn1.fit(features, my_class)  # train on all of the data
    print('训练集测试集相同时,模型的性能得分是: ',
          knn1.score(features, my_class))  # score on the training set itself
    print('\n')
    scores = cross_val_score(knn2, features, my_class, cv=5,
                             scoring='accuracy')  # 5-fold cross-validation
    print('5 折交叉验证时,模型的性能平均得分是: ',
          scores.mean())
    训练集测试集相同时,模型的性能得分是: 0.9266666666666666
    5 折交叉验证时,模型的性能平均得分是: 0.7266666666666667

    ?