当前位置 博文首页 > boysoft2002的专栏:南大《探索数据的奥秘》课件示例代码笔记14

    boysoft2002的专栏:南大《探索数据的奥秘》课件示例代码笔记14

    作者:boysoft2002 时间:2021-06-11 21:17

    Chp7-4
    2019 年 12 月 23 日

    In [32]: import numpy as np
    import pandas as pd
    import random
    from scipy import stats
    from matplotlib import pyplot as plt
    # Build four coordinate lists of 40 Gaussian samples each (std 0.25):
    # x1 and x2 are centred on -1, y1 and y2 on +1.
    x1, x2, y1, y2 = [], [], [], []
    for _ in range(40):
        # The loop body must be indented; all four draws happen once per pass.
        x1.append(random.gauss(-1, 0.25))
        x2.append(random.gauss(-1, 0.25))
        y1.append(random.gauss(1, 0.25))
        y2.append(random.gauss(1, 0.25))
    # Pairing the lists yields four point clouds around (-1, 1), (-1, -1),
    # (1, -1) and (1, 1); each is drawn with green circle markers ('og').
    plt.figure(figsize=(8, 6))
    for xs, ys in ((x1, y1), (x1, x2), (y1, x1), (y1, y2)):
        plt.plot(xs, ys, 'og')
    plt.xlim([-2.5, 2.5])
    plt.ylim([-2.5, 2.5])
    plt.savefig('cluster')

    In [41]: plt.figure(figsize=(8,6))
    # The four (x, y) list pairings that make up the point clouds.
    clouds = ((x1, y1), (x1, x2), (y1, x1), (y1, y2))
    # Scatter every cloud with green circles.
    for xs, ys in clouds:
        plt.plot(xs, ys, 'og')
    plt.xlim([-2.5, 2.5])
    plt.ylim([-2.5, 2.5])
    # Overlay a large red star at the mean position of each cloud.
    for xs, ys in clouds:
        plt.plot(np.mean(xs), np.mean(ys), '*r', markersize=15)
    plt.savefig('cluster2')

    In [51]: my_data=pd.read_csv(r'C:\Python\Scripts\my_data\tmdb_5000_movies.csv')  # raw string: backslashes in the Windows path are taken literally (no invalid-escape warnings)
    my_data.describe()  # summary statistics (count, mean, std, min, quartiles, max) per numeric column
    Out[51]: budget id popularity revenue runtime \
    count 4.803000e+03 4803.000000 4803.000000 4.803000e+03 4801.000000
    mean 2.904504e+07 57165.484281 21.492301 8.226064e+07 106.875859
    std 4.072239e+07 88694.614033 31.816650 1.628571e+08 22.611935
    min 0.000000e+00 5.000000 0.000000 0.000000e+00 0.000000
    25% 7.900000e+05 9014.500000 4.668070 0.000000e+00 94.000000
    50% 1.500000e+07 14629.000000 12.921594 1.917000e+07 103.000000
    75% 4.000000e+07 58610.500000 28.313505 9.291719e+07 118.000000
    max 3.800000e+08 459488.000000 875.581305 2.787965e+09 338.000000
    vote_average vote_count
    count 4803.000000 4803.000000
    mean 6.092172 690.217989
    std 1.194612 1234.585891
    min 0.000000 0.000000
    25% 5.600000 54.000000
    50% 6.200000 235.000000
    75% 6.800000 737.000000
    max 10.000000 13752.000000
    
    In [57]: from sklearn.cluster import KMeans
    # Cluster the movies on three numeric columns with k-means (3 clusters);
    # random_state is pinned to 1 so repeated runs use the same seed.
    X=my_data[['budget','popularity','revenue']]
    km=KMeans(n_clusters=3,random_state=1).fit(X)
    # Wrap the per-row labels in a single-column frame named 'cluster' ...
    my_cl=pd.DataFrame({'cluster':km.labels_})
    # ... and attach it to the feature columns side by side.
    X=pd.concat([X,my_cl],axis='columns')
    X.head(5)
    Out[57]: budget popularity revenue cluster
    0 237000000 150.437577 2787965087 2
    1 300000000 139.082615 961000000 2
    2 245000000 107.376788 880674609 2
    3 250000000 112.312950 1084939099 2
    4 260000000 43.926995 284139100 0
    
    In [58]: X.groupby('cluster').mean()  # mean budget, popularity and revenue within each cluster
    Out[58]: budget popularity revenue
    cluster
    0 7.318659e+07 45.302377 2.566544e+08
    1 1.721542e+07 14.292629 2.707764e+07
    2 1.496765e+08 110.824122 8.091626e+08
    In [126]: x=X['budget']
    y=X['popularity']
    z=X['revenue']
    # Dictionary mapping each of the three cluster labels to its marker colour.
    palette={0:"red",1:"green",2:"blue"}
    # One colour per sample, in row order, chosen by the sample's cluster label.
    colors=[palette[label] for label in X['cluster']]
    fig = plt.figure(figsize=(12,10))
    # Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6),
    # so the 3-D axes are created explicitly with add_subplot.
    ax = fig.add_subplot(111,projection='3d')
    ax.scatter(x,y,z,color=colors)
    # Clip the budget and revenue axes to the ranges set here.
    ax.set_xlim(0,2e8)
    ax.set_zlim(0,1e9)
    ax.set_xlabel('budget',size=15)
    ax.set_ylabel('popularity',size=15)
    ax.set_zlabel('revenue',size=15)
    fig.savefig('cluster3')

    ?