当前位置 博文首页 > LY的博客:机器学习SVC分类预测三个月后的股价

    LY的博客:机器学习SVC分类预测三个月后的股价

    作者:[db:作者] 时间:2021-08-09 22:10

    思路:通过学习近两年的每个季度报的基本面财务数据,建立模型,买入并持有预测三个月后会涨5%以上的股票,直到下一批季度报

    数据采集:用到了大约10018行数据(已去除缺失值,不采用填充),其中采用了两个技术指标(趋势指标CYES,CYEL)

       circulating_market_cap  operating_revenue    net_profit           roe  \
    count            10018.000000       1.001800e+04  1.001800e+04  10018.000000   
    mean               154.594481       3.588924e+09  4.234473e+08      3.140488   
    std                600.063348       2.005736e+10  3.071935e+09      3.247377   
    min                  3.120000       1.224100e+06  1.966154e+04      0.002400   
    25%                 33.570000       2.299639e+08  2.410783e+07      1.472500   
    50%                 58.854650       5.483732e+08  5.702822e+07      2.544550   
    75%                109.488325       1.511452e+09  1.540881e+08      3.915775   
    max              16176.732400       5.821850e+11  7.766000e+10     99.828000   
    
           inc_net_profit_year_on_year  inc_revenue_year_on_year       pe_ratio  \
    count                 10018.000000              10018.000000   10018.000000   
    mean                    385.339754                 91.570442     216.210794   
    std                    4309.253887               1648.839653    6348.503816   
    min                       0.017300                -98.926300       1.920300   
    25%                      18.204850                  7.450175      33.834325   
    50%                      47.288100                 23.729200      54.822600   
    75%                     140.021300                 48.867150      96.220775   
    max                  308628.250000             122241.677300  594205.375000   
    
                  CYEL1         CYES1           peg  price_increase_rate  \
    count  10018.000000  10018.000000  10018.000000         10018.000000   
    mean       0.065483     -0.031435      9.038361            -1.866428   
    std        1.070460      0.626431    128.634609            13.427512   
    min       -7.415192     -3.243705      0.000165           -56.206703   
    25%       -0.511237     -0.325856      0.455366            -9.436908   
    50%       -0.028772      0.013026      1.163501            -3.026580   
    75%        0.554540      0.262382      3.137165             3.569913   
    max       10.001834      9.773301   8321.761237           247.121711   
    
                  value  
    count  10018.000000  
    mean       0.214314  
    std        0.410366  
    min        0.000000  
    25%        0.000000  
    50%        0.000000  
    75%        0.000000  
    max        1.000000  



    数据处理:

    去除空白值、缺失值,对原始股价求出增长率*100,若增长率>5%,目标值value=1,反之为0

    计算了PEG(PE/G)。按照金融指标理论,PEG越小(负值已剔除),越能体现股票的价值和成长性


    数据源来自聚宽,源码:

    import numpy as np
    #去除np默认省略
    np.set_printoptions(suppress=True)
    import pandas as pd
    import re
    import time
    import jqdata
    from jqlib.technical_analysis import *

    np.set_printoptions(threshold=np.inf)


    # Last character of the report quarter -> (start MMDD, end MMDD, year shift)
    # of the price window measured after that report. The annual report (q4)
    # window falls in the following calendar year.
    _HOLDING_WINDOWS = {
        '1': ('0430', '0830', 0),
        '2': ('0831', '1030', 0),
        '3': ('1031', '1231', 0),
        '4': ('0429', '0630', 1),
    }


    def _holding_window(statdate):
        """Return the (start, end) 'YYYYMMDD' strings of the window for statdate."""
        try:
            start_md, end_md, year_shift = _HOLDING_WINDOWS[statdate[-1]]
        except KeyError:
            raise ValueError('unsupported report quarter: %r' % (statdate,))
        year = str(int(statdate[0:4]) + year_shift)
        return year + start_md, year + end_md


    def getdata(statdate):
        """Build the feature/label table for one report quarter.

        Args:
            statdate: quarter identifier such as '2016q1'; the first four
                characters are the year, the last character the quarter (1-4).

        Returns:
            DataFrame indexed by stock code with the queried fundamentals,
            CYEL1/CYES1, peg, price_increase_rate (percent change of the close
            between the first and last day of the window) and value (1.0 when
            the change exceeds 5, otherwise 0.0).

        Raises:
            ValueError: if statdate does not end in 1, 2, 3 or 4.
        """
        date_start, date_end = _holding_window(statdate)
        print('%s %s' % (date_start, date_end))

        q = query(
            valuation.code,                         # stock code
            valuation.circulating_market_cap,       # circulating market cap
            income.operating_revenue,               # operating revenue
            income.net_profit,                      # net profit
            indicator.roe,                          # return on equity
            indicator.inc_net_profit_year_on_year,  # YoY net profit growth
            indicator.inc_revenue_year_on_year,     # YoY revenue growth
            valuation.pe_ratio,
        ).order_by(valuation.code.asc())
        df = get_fundamentals(q, statDate=statdate)
        df.index = df['code']

        # Take the codes straight from the column instead of parsing the
        # printed array with a regex.
        codes = df['code'].tolist()
        print(len(codes))

        # Convert 'YYYYMMDD' to 'YYYY-MM-DD' for the price and indicator calls.
        date_start = time.strftime("%Y-%m-%d", time.strptime(date_start, "%Y%m%d"))
        print(date_start)
        date_end = time.strftime("%Y-%m-%d", time.strptime(date_end, "%Y%m%d"))
        print(date_end)

        # Daily closes over the window, one column per stock.
        prices = get_price(codes, start_date=date_start, end_date=date_end,
                           frequency='daily', fields='close')
        close = pd.DataFrame(prices['close'])

        # CYE trend indicator (long/short) as of the window start.
        CYEL1, CYES1 = CYE(codes, check_date=date_start)
        df['CYEL1'] = pd.Series(CYEL1)
        df['CYES1'] = pd.Series(CYES1)

        # PE/G
        df['peg'] = df['pe_ratio'] / df['inc_net_profit_year_on_year']

        # Percent change of the close from the first to the last day.
        first_close = close.iloc[0]
        rate = (close.iloc[-1] - first_close) / first_close * 100
        labels = pd.DataFrame({'price_increase_rate': rate})
        # A missing rate compares False and is labelled 0.0, not 1.
        labels['value'] = (labels['price_increase_rate'] > 5).astype(float)
        labels = labels[['price_increase_rate', 'value']]

        result = pd.concat([df, labels], axis=1)
        print(result['price_increase_rate'])
        print(result.describe())

        return result

    # Export one CSV per report quarter. Only 2016q1 is active here; the other
    # quarters used later are
    # ['2016q2', '2016q3', '2016q4', '2017q1', '2017q2', '2017q3'].
    statdates = ['2016q1']  # not named `list`, which would shadow the builtin
    for statdate in statdates:
        data = getdata(statdate)
        data.to_csv(str(statdate) + '.csv')
        print('已储存%s' % (statdate))

    # Read the per-quarter CSV exports and stack them into one table.
    from six import StringIO

    csvlist = ['2016q2.csv', '2016q3.csv', '2016q4.csv',
               '2017q1.csv', '2017q2.csv', '2017q3.csv']
    # Parse every file first and concatenate once instead of growing the
    # frame pairwise inside a loop.
    frames = [pd.read_csv(StringIO(read_file(name))) for name in csvlist]
    data = pd.concat(frames, axis=0)

    # Presumably 'code.1' is the duplicate created because the export wrote
    # 'code' both as the index and as a regular column.
    del data['code.1']
    print(len(data))

    # Drop negative values (stocks without growth or currently unprofitable),
    # then drop rows with missing values. Chaining dropna() returns a new
    # frame instead of mutating a filtered slice in place.
    df = data[(data['net_profit'] > 0) & (data['roe'] > 0)
              & (data['pe_ratio'] > 0) & (data['peg'] > 0)].dropna()
    print(df.head(5))

    print(df.describe())


    模型制作:

    数据中去除了负值(没有成长性和目前经营不善的股票),保留了高成长性和基本面优异的股票,用于下一步建模。

    70%训练,30%检验,特征数据进行了标准化(零均值、单位方差)。


    from sklearn import preprocessing
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC

    # Split the data 70% train / 30% test.
    # NOTE(review): 'inc_net_profit_year_on_year' is listed twice and
    # 'inc_revenue_year_on_year' is absent. This looks like a typo, but the
    # list is kept unchanged so it still matches the prediction script.
    feature_cols = ['circulating_market_cap', 'operating_revenue', 'net_profit',
                    'roe', 'inc_net_profit_year_on_year',
                    'inc_net_profit_year_on_year',
                    'pe_ratio', 'CYEL1', 'CYES1', 'peg']
    X_train, X_test, y_train, y_test = train_test_split(
        df.loc[:, feature_cols], df['value'], test_size=0.3, random_state=40)

    # An MLPClassifier(activation='relu', solver='adam', alpha=0.0001) was
    # tried as an alternative model.

    # Standardise the features. The scaler is fitted on the training split
    # only and then applied to the test split, so both are transformed with
    # the same mean/std and no test-set statistics are used.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model2 = SVC(C=1.0, kernel='rbf', gamma='auto')
    model2.fit(X_scaled, y_train)

    predict = model2.predict(X_test_scaled)

    # Accuracy on the test split.
    print(model2.score(X_test_scaled, y_test))

    # Confusion matrix (rows: true class, columns: predicted class).
    print(confusion_matrix(y_true=y_test, y_pred=predict))


    考虑到股票预测的特殊性,SCORE准确度虽然高达78%,但假阳性/阳性,真阳性/阳性的比率才是我所需要的

    在本次测试中,模型在测试集上一共做出了18次阳性(上涨超过5%)判断,假阳7次,真阳11次。效果看似并不理想,其实约61%(11/18)精确率的三个月5%的预测结果足以让投资者获得超额收益。

    ***假阳并不意味着亏损,毕竟即使只涨1%,虽然达不到value的判断阈值,但也是盈利。

    准确度:
    0.787425149701
    
    
    混淆矩阵:
    [[2356    7]
     [ 632   11]]


    sklearn svc模型的导入和使用

    使用sklearn的joblib包保存模型

    将新的季度财务数据导入模型源码:

    # -*- coding: utf-8 -*-
    from sklearn.externals import joblib
    from sklearn import preprocessing
    import numpy as np
    np.set_printoptions(suppress=True)
    import pandas as pd
    model = joblib.load('SVC_STOCK.model')

    # Read the CSV export of the new quarter.
    from six import StringIO

    body = read_file('2018q1.csv')
    data = pd.read_csv(StringIO(body))

    del data['code.1']
    print(len(data))

    # Drop negative values (stocks without growth or currently unprofitable).
    df = data[(data['net_profit'] > 0) & (data['roe'] > 0)
              & (data['pe_ratio'] > 0) & (data['peg'] > 0)]
    # Remove the price-change column, then drop rows that still have missing
    # values. dropna() returns a new frame, so its result must be kept; the
    # bare call had no effect.
    df = df.drop('price_increase_rate', axis=1).dropna(axis=0)
    print(df.head(5))
    print(df.describe())
    print(len(df))

    feature_cols = ['circulating_market_cap', 'operating_revenue', 'net_profit',
                    'roe', 'inc_net_profit_year_on_year',
                    'inc_net_profit_year_on_year',
                    'pe_ratio', 'CYEL1', 'CYES1', 'peg']
    X1 = df.loc[:, feature_cols]

    # NOTE(review): this standardises with the new quarter's own mean/std
    # rather than the statistics used when the model was fitted. Confirm
    # this is intended.
    X_scaled = preprocessing.scale(X1)
    predict = model.predict(X_scaled)

    # Number of stocks predicted as class 1.
    count = int((predict == 1).sum())

    print(count)

    11

    将2018年一季报的财务数据导入模型,预测出11只股票会上涨超过5%

    # Print the codes of the stocks predicted as class 1.

    # Work on a copy so the new column is not written into a filtered slice.
    df = df.copy()
    df['predict'] = predict

    predicted_codes = df.loc[df['predict'] == 1, 'code'].values

    print(predicted_codes)

    ['000333.XSHE' '000651.XSHE' '000858.XSHE' '002290.XSHE' '002415.XSHE'
     '600019.XSHG' '600267.XSHG' '600276.XSHG' '600311.XSHG' '600519.XSHG'
     '600747.XSHG']


    对上述的结果进行回测检验

    joinquant 回测代码:

    # 导入函数库
    import jqdata
    ? ??
    from datetime import *?
    import math
    import numpy as np
    import pandas as pd

    # Initialisation: benchmark, pricing mode, costs, stock lists, schedule.
    def initialize(context):
        """Configure the backtest and register the daily callbacks.

        Stores one stock list per report quarter on `g` (g.stock_2016q1 ...
        g.stock_2018q1) for before_market_open to choose from.
        """
        benchmark = '000300.XSHG'

        # Use the CSI 300 index as benchmark.
        set_benchmark(benchmark)
        # Enable dynamic price adjustment (real prices).
        set_option('use_real_price', True)

        # Stock trading costs: 0.03% commission on buy, 0.03% commission plus
        # 0.1% stamp tax on sell, minimum commission of 5 per trade.
        stock_cost = OrderCost(close_tax=0.001, open_commission=0.0003,
                               close_commission=0.0003, min_commission=5)
        set_order_cost(stock_cost, type='stock')

        # Stock lists per report quarter, 2016q1-2018q1.
        # Windows: 0430-0830, 0831-1030, 1031-0430, 0429-0629
        # (the annual report is not used because it is published too late).
        # A 2015Q2 list existed in an earlier version and is no longer used.
        g.stock_2016q1 = [
            '000892.XSHE', '600519.XSHG', '601766.XSHG',
        ]
        g.stock_2016q2 = [
            '000609.XSHE', '600519.XSHG', '600984.XSHG',
            '601398.XSHG', '601766.XSHG', '601800.XSHG',
        ]
        g.stock_2016q3 = [
            '000002.XSHE', '000333.XSHE', '600519.XSHG',
            '601318.XSHG', '601628.XSHG', '601800.XSHG',
        ]
        g.stock_2016q4 = [
            '600519.XSHG', '601766.XSHG',
        ]
        g.stock_2017q1 = [
            '000333.XSHE', '000557.XSHE', '000651.XSHE', '002122.XSHE',
            '002141.XSHE', '002265.XSHE', '002346.XSHE', '300093.XSHE',
            '300338.XSHE', '300391.XSHE', '600019.XSHG', '600251.XSHG',
            '600519.XSHG', '600693.XSHG', '600817.XSHG', '600876.XSHG',
            '601628.XSHG',
        ]
        g.stock_2017q2 = [
            '000002.XSHE', '000333.XSHE', '000651.XSHE', '000858.XSHE',
            '002415.XSHE', '300618.XSHE', '600519.XSHG', '601601.XSHG',
        ]
        g.stock_2017q3 = [
            '000002.XSHE', '000333.XSHE', '000651.XSHE', '002192.XSHE',
            '300354.XSHE', '600019.XSHG', '600238.XSHG', '600519.XSHG',
            '600769.XSHG', '601177.XSHG', '601318.XSHG', '601601.XSHG',
        ]
        g.stock_2018q1 = [
            '000333.XSHE', '000651.XSHE', '000858.XSHE', '002290.XSHE',
            '002415.XSHE', '600019.XSHG', '600267.XSHG', '600276.XSHG',
            '600311.XSHG', '600519.XSHG', '600747.XSHG',
        ]

        # Optional: filter out order-API logs below error level.
        # log.set_level('order', 'error')

        # Scheduled callbacks. reference_security only selects the kind of
        # instrument used as the time reference, so '000300.XSHG' and
        # '510300.XSHG' are equivalent here.
        # Before the market opens.
        run_daily(before_market_open, time='before_open', reference_security=benchmark)
        # At the open, or at the start of every bar.
        run_daily(market_open, time='every_bar', reference_security=benchmark)
        # After the market closes.
        run_daily(after_market_close, time='after_close', reference_security=benchmark)
    ? ??
    ## Runs before the market opens
    def before_market_open(context):
        """Pick the stock list for the current date and filter it.

        Sets g.stock to the list of the report quarter whose holding window
        contains context.current_dt, then removes paused, ST and delisting
        stocks from it. Dates outside every listed window (including dates on
        or before 2016-03-31) fall back to the 2018q1 list.

        The windows form a single if/elif chain. Previously the chain was
        split in two, so the trailing `else` replaced every 2016 selection
        with the 2018q1 list.
        """
        today = context.current_dt.date()

        if date(2016, 3, 31) < today <= date(2016, 8, 30):
            g.stock = g.stock_2016q1
        elif date(2016, 8, 31) <= today <= date(2016, 10, 30):
            g.stock = g.stock_2016q2
        elif date(2016, 10, 31) <= today <= date(2017, 4, 30):
            g.stock = g.stock_2016q3
        elif date(2017, 5, 1) <= today <= date(2017, 8, 30):
            g.stock = g.stock_2017q1
        elif date(2017, 8, 31) <= today <= date(2017, 10, 30):
            g.stock = g.stock_2017q2
        elif date(2017, 10, 31) <= today <= date(2018, 4, 30):
            g.stock = g.stock_2017q3
        else:
            g.stock = g.stock_2018q1

        # Filter ST and paused stocks. The helper returns a new list, so the
        # result has to be assigned back.
        g.stock = remove_paused_and_st(g.stock)
    ??
    ? ??
    ## Runs at market open
    def market_open(context):
        """Sell holdings that left the list, then buy the listed stocks.

        Every held security that is not in g.stock is sold completely. After
        that, each stock in g.stock is bought for a value of 100000 as long
        as more than 100000 of cash is available.
        """
        # Iterate over a snapshot so placing orders cannot change the
        # collection being iterated.
        for held in tuple(context.portfolio.positions):
            if held not in g.stock:
                order_target(held, 0)

        for code in g.stock:
            if context.portfolio.available_cash > 100000:
                order_value(code, 100000)
    ? ??
    ? ?
    ?
    ## Runs after the market closes
    def after_market_close(context):
        """Do nothing; placeholder for the scheduled after-close callback."""
        return None




    def remove_paused_and_st(stock_list):
        """Return the stocks that are not paused, ST or marked for delisting.

        A stock is dropped when its current data reports it as paused or ST,
        or when its name contains 'ST', '*' or '退'. The input list is not
        modified; a new list in the same order is returned.
        """
        current_data = get_current_data(stock_list)
        kept = []
        for stock in stock_list:
            info = current_data[stock]
            if info.paused or info.is_st:
                continue
            name = info.name
            if 'ST' in name or '*' in name or '退' in name:
                continue
            kept.append(stock)
        return kept
    ? ? ? ??

    这个策略只择股不择时,不止损和无风控,纯属毛坯房。未来将考虑使用技术指标的改进版进行择时交易,并进行止盈止损。


    最大回撤20%(时间区间为今年初A股受中美经济战影响大跌期间),事实上2018年5月股价已明显反弹,并且表现明显强于沪深300

    cs