【比赛方案】2019年CCF大数据与计算智能大赛乘用车细分市场销量预测冠军解决方案

2023-05-16

        共使用一个lgb模型与一个规则模型进行融合,总运行时长约3min 代码全部整合到一个文件里了,共约500行

        本次比赛初赛有60个车型,复赛有82个车型,由于初赛时长较长,复赛时长较短,因此复赛时我们保留了初赛60个车型的预测结果 整体的代码结构为使用初赛60车型预测60车型,复赛全部82车型预测剩下的22个车型

        由于赛题比较简单,主要的工作集中在特征工程与对赛题的分析上,思路简单

代码:

import time
import math
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

#对销量采取平滑log处理
is_get_82_model,lg,log = 0, 2, 1
'*****************************************读取初赛和复赛数据*************************************************'
pre_train_sale = pd.read_csv(r'.\pre_data\train_sales_data.csv')
input_data  = pd.read_csv(r'.\data\train_sales_data.csv')
final_data  = pd.read_csv(r'.\data\evaluation_public.csv')
search_data = pd.read_csv(r'.\data\train_search_data.csv')
#将复赛新车型标记出来
pre_model = list(set(list(pre_train_sale['model'])))
input_data['new_model'] = list(map(lambda x: 1 if pre_model.count(x) == 0 else 0,input_data['model']))
final_data['new_model'] = list(map(lambda x: 1 if pre_model.count(x) == 0 else 0,final_data['model']))
def prepare(data):
    #对数据进行预处理,将各个属性转为数值特征
    data['date'] = list(map(lambda x,y:str(x)+"."+str(y),data['regYear'],data['regMonth']))
    data['date'] = pd.to_datetime(data['date'])
    if 'forecastVolum' in list(data.columns):
        data = data.drop(['forecastVolum'],axis=1)
    if 'province' in list(data.columns):
        pro_label = dict(zip(sorted(list(set(data['province']))), range(0, len(set(data['province'])))))
    model_label = dict(zip(sorted(list(set(data['model']))), range(0, len(set(data['model'])))))
    if 'bodyType' in list(data.columns):
       body_label = dict(zip(sorted(list(set(data['bodyType']))), range(0, len(set(data['bodyType'])))))
       data['body_id'] = data['bodyType'].map(body_label)
       data=data.drop(['bodyType'],axis=1)
    if 'province' in list(data.columns):
        data['pro_id'] = data['province'].map(pro_label)
    data['model_id'] = data['model'].map(model_label)
    data=data.drop(['regYear','regMonth','model'],axis=1)
    if 'province' in list(data.columns):
         data=data.drop(['adcode','province'],axis=1)
    data['month_id'] = data['date'].apply(lambda x : x.month)
    data['sales_year'] = data['date'].apply(lambda x : x.year)
    data['time_id'] = list(map(lambda x,y:(x-2016)*12+y,data['sales_year'],data['month_id']))
    data=data.drop(['date'],axis=1).rename(columns={'salesVolume':'label'})
    return data

'*****************************************预处理所有文件*************************************************'
input_data  = prepare(input_data)
final_data  = prepare(final_data)
search_data = prepare(search_data)
#将预测的文件拼接到数据集中并补全bodytype
pivot = pd.pivot_table(input_data,index=['model_id','body_id'])
pivot = pd.DataFrame(pivot).reset_index()[['model_id','body_id']]
final_data = pd.merge(final_data,pivot,on='model_id',how='left')
input_data = pd.merge(input_data,search_data,how='left',on=['pro_id','model_id','sales_year','month_id','time_id'])
input_data = pd.concat([input_data,final_data])
input_data['salesVolume'] = input_data['label']
'********************************************特征提取*****************************************************'
def get_stat_feature(df_,month):   
    data = df_.copy()
    stat_feat = []
    start = int((month-24)/3)*2
    start += int((month-24)/4)
    start = start-1 if start >=1 else start
    '历史月销量' 
    for last in range(1,17):  
        tmp=data.copy()
        tmp['time_id'] = list(map(lambda x:x+last+start if x+last+start<=28 else -1,tmp['time_id']))
        tmp = tmp[~tmp['time_id'].isin([-1])][['label','time_id','pro_id','model_id','body_id']]
        tmp = tmp.rename(columns={'label':'last_{0}_sale'.format(last)})
        data = pd.merge(data,tmp,how='left',on=['time_id','pro_id','model_id','body_id'])
        if last <= 6:
            stat_feat.append('last_{0}_sale'.format(last)) 
    '历史月popularity'
    for last in range(1,17):  
        tmp=data.copy()
        tmp['time_id']=list(map(lambda x:x+last+start if x+last+start<=28 else -1,tmp['time_id']))
        tmp=tmp[~tmp['time_id'].isin([-1])][['popularity','time_id','pro_id','model_id','body_id']]
        tmp=tmp.rename(columns={'popularity':'last_{0}_popularity'.format(last)})
        data=pd.merge(data,tmp,how='left',on=['time_id','pro_id','model_id','body_id'])
        if last<=6 or (last>=11 and last<=13):
            stat_feat.append('last_{0}_popularity'.format(last)) 

    '半年销量等统计特征'
    data['1_6_sum'] = data.loc[:,'last_1_sale':'last_6_sale'].sum(1)
    data['1_6_mea'] = data.loc[:,'last_1_sale':'last_6_sale'].mean(1)
    data['1_6_max'] = data.loc[:,'last_1_sale':'last_6_sale'].max(1)
    data['1_6_min'] = data.loc[:,'last_1_sale':'last_6_sale'].min(1)
    data['jidu_1_3_sum']  = data.loc[:,'last_1_sale':'last_3_sale'].sum(1)
    data['jidu_4_6_sum']  = data.loc[:,'last_4_sale':'last_6_sale'].sum(1)
    data['jidu_1_3_mean'] = data.loc[:,'last_1_sale':'last_3_sale'].mean(1)
    data['jidu_4_6_mean'] = data.loc[:,'last_4_sale':'last_6_sale'].mean(1)
    sales_stat_feat = ['1_6_sum','1_6_mea','1_6_max','1_6_min','jidu_1_3_sum','jidu_4_6_sum','jidu_1_3_mean','jidu_4_6_mean']
    stat_feat = stat_feat + sales_stat_feat
    
    'model_pro趋势特征'
    data['1_2_diff'] = data['last_1_sale'] - data['last_2_sale']
    data['1_3_diff'] = data['last_1_sale'] - data['last_3_sale']
    data['2_3_diff'] = data['last_2_sale'] - data['last_3_sale']
    data['2_4_diff'] = data['last_2_sale'] - data['last_4_sale']
    data['3_4_diff'] = data['last_3_sale'] - data['last_4_sale']
    data['3_5_diff'] = data['last_3_sale'] - data['last_5_sale']
    data['jidu_1_2_diff'] = data['jidu_1_3_sum'] - data['jidu_4_6_sum']
    trend_stat_feat = ['1_2_diff','1_3_diff','2_3_diff','2_4_diff','3_4_diff','3_5_diff','jidu_1_2_diff']
    stat_feat = stat_feat + trend_stat_feat

    '春节月'
    yanhaicity={1,2,5,7,9,13,16,17}
    data['is_yanhai']  = list(map(lambda x:1 if x in yanhaicity else 0,data['pro_id']))
    data['is_chunjie'] = list(map(lambda x:1 if x==2 or x==13 or x==26 else 0,data['time_id']))
    data['is_chunjie_before'] = list(map(lambda x:1 if x==1 or x==12 or x==25 else 0,data['time_id']))
    data['is_chunjie_late']   = list(map(lambda x:1 if x==3 or x==14 or x==27 else 0,data['time_id']))
    month_city_stat_feat = ['is_chunjie','is_chunjie_before','is_chunjie_late','is_yanhai']
    stat_feat = stat_feat + month_city_stat_feat
    
    '两个月销量差值'
    'model 前两个月的销量差值'
    pivot = pd.pivot_table(data,index=['model_id'],values='1_2_diff',aggfunc=np.sum)
    pivot = pd.DataFrame(pivot).rename(columns={'1_2_diff':'model_1_2_diff_sum'}).reset_index()
    data  = pd.merge(data,pivot,on=['model_id'],how='left')
    'pro 前两个月的销量差值'
    pivot = pd.pivot_table(data,index=['pro_id'],values='1_2_diff',aggfunc=np.sum)
    pivot = pd.DataFrame(pivot).rename(columns={'1_2_diff':'pro_1_2_diff_sum'}).reset_index()
    data  = pd.merge(data,pivot,on=['pro_id'],how='left')
    'model,pro 前两个月的销量差值'
    pivot = pd.pivot_table(data,index=['pro_id','model_id'],values='1_2_diff',aggfunc=np.sum)
    pivot = pd.DataFrame(pivot).rename(columns={'1_2_diff':'model_pro_1_2_diff_sum'}).reset_index()
    data  = pd.merge(data,pivot,on=['pro_id','model_id'],how='left')
    pivot = pd.pivot_table(data,index=['pro_id','model_id'],values='1_2_diff',aggfunc=np.mean)
    pivot = pd.DataFrame(pivot).rename(columns={'1_2_diff':'model_pro_1_2_diff_mean'}).reset_index()
    data  = pd.merge(data,pivot,on=['pro_id','model_id'],how='left')
    two_month_stat_feat = ['model_1_2_diff_sum','pro_1_2_diff_sum','model_pro_1_2_diff_sum','model_pro_1_2_diff_mean']
    stat_feat = stat_feat + two_month_stat_feat

    '月份'
    count_month = [31,28,31,30,31,30,31,31,30,31,30,31]
    data['count_month'] = list(map(lambda x:count_month[int(x-1)],data['month_id']))
    jiaqibiao = [[11,12,8,10,10,9,10,8,9,13,8,9],[12,9,8,11,10,8,10,8,8,14,8,10],[9,11,9,11]]
    data['count_jiaqi'] = list(map(lambda x,y:jiaqibiao[int(x-2016)][int(y-1)],data['sales_year'],data['month_id']))
    stat_feat.append('count_month')
    stat_feat.append('count_jiaqi')
    
    '环比'
    data['huanbi_1_2'] = data['last_1_sale'] / data['last_2_sale']
    data['huanbi_2_3'] = data['last_2_sale'] / data['last_3_sale']
    data['huanbi_3_4'] = data['last_3_sale'] / data['last_4_sale']
    data['huanbi_4_5'] = data['last_4_sale'] / data['last_5_sale']
    data['huanbi_5_6'] = data['last_5_sale'] / data['last_6_sale']
    ring_ratio_stat_feat = ['huanbi_1_2','huanbi_2_3','huanbi_3_4','huanbi_5_6']
    stat_feat = stat_feat + ring_ratio_stat_feat

    'add环比比'
    data['huanbi_1_2_2_3'] = data['huanbi_1_2'] / data['huanbi_2_3']
    data['huanbi_2_3_3_4'] = data['huanbi_2_3'] / data['huanbi_3_4']
    data['huanbi_3_4_4_5'] = data['huanbi_3_4'] - data['huanbi_4_5']
    data['huanbi_4_5_5_6'] = data['huanbi_4_5'] - data['huanbi_5_6']
    two_ring_ratio_stat_feat = ['huanbi_1_2_2_3','huanbi_2_3_3_4','huanbi_3_4_4_5','huanbi_4_5_5_6']
    stat_feat = stat_feat + two_ring_ratio_stat_feat

    '该月该省份bodytype销量的占比与涨幅'
    for i in range(1,7):
        last_time='last_{0}_sale'.format(i)
        pivot = pd.pivot_table(data,index=['time_id','pro_id','body_id'],values=last_time,aggfunc=np.sum)
        pivot = pd.DataFrame(pivot).rename(columns={last_time:'pro_body_last_{0}_sale_sum'.format(i)}).reset_index()
        data  = pd.merge(data,pivot,on=['time_id','pro_id','body_id'],how='left')
        data['last_{0}_sale_ratio_pro_body_last_{0}_sale_sum'.format(i,i)]=list(map(lambda x,y:x/y if y!=0 else 0,data[last_time],data['pro_body_last_{0}_sale_sum'.format(i)]))
        stat_feat.append('last_{0}_sale_ratio_pro_body_last_{0}_sale_sum'.format(i,i))
        if i>=2:
            data['last_{0}_{1}_sale_pro_body_diff'.format(i-1,i)] = data['last_{0}_sale_ratio_pro_body_last_{0}_sale_sum'.format(i-1)]-data['last_{0}_sale_ratio_pro_body_last_{0}_sale_sum'.format(i)]
            stat_feat.append('last_{0}_{1}_sale_pro_body_diff'.format(i-1,i))

    '该月该省份总销量占比与涨幅'
    for i in range(1,7):
        last_time = 'last_{0}_sale'.format(i)
        pivot = pd.pivot_table(data,index=['time_id','pro_id'],values=last_time,aggfunc=np.sum)
        pivot = pd.DataFrame(pivot).rename(columns={last_time:'pro__last_{0}_sale_sum'.format(i)}).reset_index()
        data  = pd.merge(data,pivot,on=['time_id','pro_id'],how='left')
        data['last_{0}_sale_ratio_pro_last_{0}_sale_sum'.format(i,i)]=list(map(lambda x,y:x/y if y!=0 else 0,data[last_time],data['pro__last_{0}_sale_sum'.format(i)]))
        stat_feat.append('last_{0}_sale_ratio_pro_last_{0}_sale_sum'.format(i,i))
        if i>=2:
            data['model_last_{0}_{1}_sale_pro_diff'.format(i-1,i)] = data['last_{0}_sale_ratio_pro_last_{0}_sale_sum'.format(i-1)]-data['last_{0}_sale_ratio_pro_last_{0}_sale_sum'.format(i)]
            stat_feat.append('model_last_{0}_{1}_sale_pro_diff'.format(i-1,i))

    'popularity的涨幅占比'
    data['huanbi_1_2popularity'] = (data['last_1_popularity'] - data['last_2_popularity']) / data['last_2_popularity']
    data['huanbi_2_3popularity'] = (data['last_2_popularity'] - data['last_3_popularity']) / data['last_3_popularity']
    data['huanbi_3_4popularity'] = (data['last_3_popularity'] - data['last_4_popularity']) / data['last_4_popularity']
    data['huanbi_4_5popularity'] = (data['last_4_popularity'] - data['last_5_popularity']) / data['last_5_popularity']
    data['huanbi_5_6popularity'] = (data['last_5_popularity'] - data['last_6_popularity']) / data['last_6_popularity']
    popularity_ratio_stat_feat = ['huanbi_1_2popularity','huanbi_2_3popularity','huanbi_3_4popularity','huanbi_4_5popularity','huanbi_5_6popularity']
    stat_feat = stat_feat + popularity_ratio_stat_feat

    'popu_modelpopularity'
    for i in range(1,7):
        last_time='last_{0}_popularity'.format(i)
        pivot = pd.pivot_table(data,index=['time_id','model_id'],values=last_time,aggfunc=np.sum)
        pivot = pd.DataFrame(pivot).rename(columns={last_time:'model__last_{0}_popularity_sum'.format(i)}).reset_index()
        data  = pd.merge(data,pivot,on=['time_id','model_id'],how='left')
        data['last_{0}_popularity_ratio_model_last_{0}_popularity_sum'.format(i,i)]=list(map(lambda x,y:x/y if y!=0 else 0,data[last_time],data['model__last_{0}_popularity_sum'.format(i)]))
        stat_feat.append('last_{0}_popularity_ratio_model_last_{0}_popularity_sum'.format(i,i))  

    'body month 增长率popularitydemo4'
    for i in range(1,7):
        last_time='last_{0}_popularity'.format(i)
        pivot = pd.pivot_table(data,index=['time_id','body_id'],values=last_time,aggfunc=np.sum)
        pivot = pd.DataFrame(pivot).rename(columns={last_time:'body_last_{0}_popularity_sum'.format(i)}).reset_index()
        data  = pd.merge(data,pivot,on=['time_id','body_id'],how='left')
        data['last_{0}_popularity_ratio_body_last_{0}_popularity_sum'.format(i,i)]=list(map(lambda x,y:x/y if y!=0 else 0,data[last_time],data['body_last_{0}_popularity_sum'.format(i)]))
        if i>=2:
            data['last_{0}_{1}_popularity_body_diff'.format(i-1,i)] = (data['last_{0}_popularity_ratio_body_last_{0}_popularity_sum'.format(i-1)]-data['last_{0}_popularity_ratio_body_last_{0}_popularity_sum'.format(i)])/data['last_{0}_popularity_ratio_body_last_{0}_popularity_sum'.format(i)]
            stat_feat.append('last_{0}_{1}_popularity_body_diff'.format(i-1,i)) 

    '同比一年前的增长'
    data["increase16_4"]=(data["last_16_sale"] - data["last_4_sale"]) / data["last_16_sale"]
    pivot = pd.pivot_table(data,index=["model_id","time_id"],values='last_12_sale',aggfunc=np.mean)
    pivot = pd.DataFrame(pivot).rename(columns={'last_12_sale':'mean_province'}).reset_index()
    data  = pd.merge(data,pivot,on=["model_id","time_id"],how="left")
    pivot = pd.pivot_table(data,index=["model_id","time_id"],values='last_12_sale',aggfunc=np.min)
    pivot = pd.DataFrame(pivot).rename(columns={'last_12_sale':'min_province'}).reset_index()
    data  = pd.merge(data,pivot,on=["model_id","time_id"],how="left")
    '前4个月车型的同比'
    for i in range(1,5):
        pivot = pd.pivot_table(data,index=["model_id","time_id"],values='last_{0}_sale'.format(i),aggfunc=np.mean)
        pivot = pd.DataFrame(pivot).rename(columns={'last_{0}_sale'.format(i):'mean_province_{0}'.format(i)}).reset_index()
        data  = pd.merge(data,pivot,on=["model_id","time_id"],how="left")
        pivot = pd.pivot_table(data,index=["model_id","time_id"],values='last_{0}_sale'.format(i+12),aggfunc=np.mean)
        pivot = pd.DataFrame(pivot).rename(columns={'last_{0}_sale'.format(i+12):'mean_province_{0}'.format(i+12)}).reset_index()
        data  = pd.merge(data,pivot,on=["model_id","time_id"],how="left")
    data["increase_mean_province_14_2"] = (data["mean_province_14"] - data["mean_province_2"]) / data["mean_province_14"]
    data["increase_mean_province_13_1"] = (data["mean_province_13"] - data["mean_province_1"]) / data["mean_province_13"]
    data["increase_mean_province_16_4"] = (data["mean_province_16"] - data["mean_province_4"]) / data["mean_province_16"]
    data["increase_mean_province_15_3"] = (data["mean_province_15"] - data["mean_province_3"]) / data["mean_province_15"]
    new_stat_feat = ["mean_province","min_province","increase16_4","increase_mean_province_15_3","increase_mean_province_16_4","increase_mean_province_14_2","increase_mean_province_13_1"]
    
    return data,stat_feat + new_stat_feat 

'********************************************模型训练***********************************************'
def get_model_type():   
    model = lgb.LGBMRegressor(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
            max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
            n_estimators=600, subsample=0.9, colsample_bytree=0.7,
            )
    return model

def get_train_model(df_, m, m_type,features, num_feat, cate_feat):
    
    df = df_.copy()
    # 数据集划分
    all_idx   = df['time_id'].between(7 , m-1)
    test_idx  = df['time_id'].between(m , m  )
    #初始化model    
    model = get_model_type()
    model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat,verbose=100)
    df['forecastVolum'] = model.predict(df[features]) 
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 2.0 if x < 0 else x)
    return sub

def LGB(input_data,is_get_82_model):
    #采用lightgbm销量进行预测,这里采取分月预测的形式,分别预测1 2 3 4月
    #同时,分别对初赛和复赛的车型进行分别预测,在预测初赛的车型时只使用初赛的数据,在预测复赛新加的车型时使用全部数据
    if is_get_82_model == 0:
        input_data = input_data[input_data['new_model']==0]
    input_data['label'] = list(map(lambda x : x if x==np.NAN else math.log(x+1,lg),input_data['label']))
    input_data['salesVolume'] = list(map(lambda x : x if x==np.NAN else math.log(x+1,lg),input_data['salesVolume']))
    input_data['jidu_id'] = ((input_data['month_id']-1)/3+1).map(int)
    '******************************分月预测************************************************************'
    for month in [25,26,27,28]: 
        m_type = 'lgb' 
        data_df, stat_feat = get_stat_feature(input_data,month)
        num_feat = ['sales_year']+stat_feat
        cate_feat = ['pro_id','body_id','model_id','month_id','jidu_id']
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
        features = num_feat + cate_feat
        sub = get_train_model(data_df, month, m_type, features, num_feat, cate_feat)   
        input_data.loc[(input_data.time_id==month),  'salesVolume'] = sub['forecastVolum'].values
        input_data.loc[(input_data.time_id==month),  'label'      ] = sub['forecastVolum'].values
    input_data['salesVolume'] = list(map(lambda x : x if x==np.NAN else (lg**(x))-1, input_data['salesVolume']))
    input_data['salesVolume'] = list(map(lambda x,y: x*0.95 if y == 26 else x,input_data['salesVolume'],input_data['time_id']))
    input_data['salesVolume'] = list(map(lambda x,y: x*0.98 if y == 27 else x,input_data['salesVolume'],input_data['time_id']))
    input_data['salesVolume'] = list(map(lambda x,y: x*0.90 if y == 28 else x,input_data['salesVolume'],input_data['time_id']))
    sub = input_data.loc[(input_data.time_id >= 25),['id','salesVolume']]
    sub.columns = ['id','forecastVolum']
    sub['id'] = sub['id'].map(int)
    sub['forecastVolum'] = sub['forecastVolum'].map(round)
    return sub

def get_lgb_ans(input_data):
    #对销量进行预测,并返回最终lgb预测的结果
    print('use 60 models to train lgb model...')
    sub_60=LGB(input_data,0)
    print('use 82 models to train lgb model...')
    sub_82=LGB(input_data,1)
    input_data = pd.merge(input_data,sub_60,on='id',how='left')
    input_data = pd.merge(input_data,sub_82,on='id',how='left')
    input_data = input_data.loc[input_data.time_id>=25,['id','forecastVolum_x','forecastVolum_y']]
    input_data = input_data.fillna(-1)
    input_data['forecastVolum'] = list(map(lambda x,y:y if x==-1 else x,input_data['forecastVolum_x'],input_data['forecastVolum_y']))
    input_data = input_data[['id','forecastVolum']]
    input_data['id'] = input_data['id'].map(int)
    input_data['forecastVolum'] = input_data['forecastVolum'].map(int)
    return input_data

'********************************************rule训练***********************************************'
def exp_smooth(df,alpha=0.97,base=50,start=1,win_size=3,t=24):
    #使用三次指数平滑,根据历史销量值的变化趋势预测将来销量
    #平滑因子,两次平滑之间间隔大小,起始编号,初始值的窗口大小,平滑周期
    #第一次指数平滑
    df[start+base-1] = 0
    for i in range(win_size):
        df[start+base-1] += df[start+i] / win_size
    for i in range(t):
        df[start+base+i] =  alpha * df[start+i] + (1 - alpha) * df[start+base+i-1]
    #第二次指数平滑
    df[start+2*base-1] = 0
    for i in range(win_size):
        df[start+2*base-1] += df[start+base+i] / win_size
    for i in range(t):
        df[start+2*base+i] =  alpha * df[start+base+i] + (1 - alpha) * df[start+2*base+i-1]
    #第三次指数平滑
    df[start+3*base-1] = 0
    for i in range(win_size):
        df[start+3*base-1] += df[start+2*base+i] / win_size
    for i in range(t):
        df[start+3*base+i] =  alpha * df[start+2*base+i] + (1 - alpha) * df[start+3*base+i-1]
    #套入公式计算未来两个月的平滑值
    t1,t2,t3 = df[start+base+t-1],df[start+2*base+t-1],df[start+3*base+t-1]
    a = 3 * t1 - 3 * t2 + t3
    b = ((6 - 5 * alpha) * t1 - 2 * (5 - 4 * alpha) * t2 + (4 - 3 * alpha) * t3) * alpha / (2 * (1 - alpha) ** 2)
    c = (t1 - 2 * t2 + t3) * alpha ** 2 / (2 * (1 - alpha) ** 2)
    for m in [25,26]:
        df[m] = a + b * (m-t) + c * (m-t) ** 2
    return df
        
def pre_rule():
    #初赛60车型的规则方案
    train = pd.read_csv(r'.\pre_data\train_sales_data.csv')
    test = pd.read_csv(r'.\pre_data\evaluation_public.csv')
    #对数据取对数,缩小销量之间的差距,降低极端值的影响
    train['salesVolume'] = np.log(train['salesVolume'])
    
    #规则
    train16 = train[(train['regYear'] == 2016)][['adcode', 'model', 'regMonth', 'salesVolume']]
    train17 = train[(train['regYear'] == 2017)][['adcode', 'model', 'regMonth', 'salesVolume']]
    #下半年的趋势
    df16 = train16.loc[train16['regMonth'] > 6].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"16_after_mean": 'mean'}) # 按省份和车型统计均值
    df17 = train17.loc[train17['regMonth'] > 6].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"17_after_mean": 'mean'})
    df = pd.merge(df17, df16, on=['adcode', 'model'], how='inner')
    df['after_factor'] = df['17_after_mean'] / df['16_after_mean'] # 17年均值除以16年均值得到趋势因子
    #上半年的趋势
    df16 = train16.loc[train16['regMonth'] <= 6].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"16_front_mean": 'mean'}) # 按省份和车型统计均值
    df17 = train17.loc[train17['regMonth'] <= 6].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"17_front_mean": 'mean'})
    df17 = df17.merge(df16, on=['adcode', 'model'], how='inner')
    df['front_factor'] = df17['17_front_mean'] / df17['16_front_mean'] # 17年均值除以16年均值得到趋势因子
    #总体趋势
    df['factor'] = 0.35 * df['front_factor'] + 0.65 * df['after_factor']
    # 在省份-车型作为主键的情况下,取出16年和17年的数据,共24个月
    for m in range(1, 13):
        df = pd.merge(df, train16[train16['regMonth'] == m][['adcode', 'model', 'salesVolume']], on=['adcode', 'model'], how='left').rename(columns={'salesVolume': m})
        df = pd.merge(df, train17[train17['regMonth'] == m][['adcode', 'model', 'salesVolume']], on=['adcode', 'model'], how='left').rename(columns={'salesVolume': 12+m})
    df = exp_smooth(df,alpha=0.97)
    
    res = pd.DataFrame()
    tmp = df[['adcode', 'model']].copy()
    #后处理
    trend_factor = [0.985,0.965,0.99,0.985]
    #开始预测
    for i,m in enumerate([25,26,27,28]):
        #以省份-车型作为主键,计算前年,去年,最近几个月的值,然后加权得到一个当前月份的预测值
        last_year_base = 0.2 * df[m-13].values + 0.6 * df[m-12].values + 0.2 * df[m-11].values
        if m == 25:
            last_last_year_base = 0.8 * df[m-24] + 0.2 * df[m-23]
        else:
            last_last_year_base = 0.2 * df[m-25] + 0.6 * df[m-24] + 0.2 * df[m-23]
        if m <=26:
            near_base = 0.2 * df[m-3] + 0.2 * df[m-2] + 0.3 * df[m-1] + 0.3 * df[m]
        else:
            near_base = 0.2 * df[m-3] + 0.2 * df[m-2] + 0.6 * df[m-1]
            
        base = (last_year_base + near_base + last_last_year_base) / 3
        tmp['forecastVolum'] = base * df['factor'] * trend_factor[i]
        df[m] = tmp['forecastVolum']
        tmp['regMonth'] = m-24
        res = res.append(tmp, ignore_index=True)
    
    test = pd.merge(test[['id', 'adcode', 'model', 'regMonth']], res, how='left', on=['adcode', 'model', 'regMonth'])
    test['forecastVolum'] = np.exp(test['forecastVolum'])
    test.loc[test['forecastVolum'] < 0, 'forecastVolum'] = 0
    return test[['id', 'forecastVolum']]

def rule():
    #复赛22车型的规则方案
    train = pd.read_csv(r'.\data\train_sales_data.csv')
    test = pd.read_csv(r'.\data\evaluation_public.csv')
    #对数据取对数,缩小销量之间的差距,降低极端值的影响
    train['salesVolume'] = np.log1p(train['salesVolume'])
    
    #规则
    train16 = train[(train['regYear'] == 2016)][['adcode', 'model', 'regMonth', 'salesVolume']]
    train17 = train[(train['regYear'] == 2017)][['adcode', 'model', 'regMonth', 'salesVolume']]
    #1~3的趋势
    df16 = train16.loc[train16['regMonth'] <= 3].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"16_3_mean": 'mean'}) # 按省份和车型统计均值
    df17 = train17.loc[train17['regMonth'] <= 3].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"17_3_mean": 'mean'})
    df = pd.merge(df17, df16, on=['adcode', 'model'], how='inner')
    df['3_factor'] = df['17_3_mean'] / df['16_3_mean'] # 17年均值除以16年均值得到趋势因子
    #4~6的趋势
    df16 = train16.loc[(train16['regMonth'] >= 4) & (train16['regMonth'] <= 6)].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"16_6_mean": 'mean'}) # 按省份和车型统计均值
    df17 = train17.loc[(train17['regMonth'] >= 4) & (train17['regMonth'] <= 6)].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"17_6_mean": 'mean'})
    df17 = df17.merge(df16, on=['adcode', 'model'], how='inner')
    df['6_factor'] = df17['17_6_mean'] / df17['16_6_mean'] # 17年均值除以16年均值得到趋势因子
    #7~9的趋势
    df16 = train16.loc[(train16['regMonth'] >= 7) & (train16['regMonth'] <= 9)].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"16_9_mean": 'mean'}) # 按省份和车型统计均值
    df17 = train17.loc[(train17['regMonth'] >= 7) & (train17['regMonth'] <= 9)].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"17_9_mean": 'mean'})
    df17 = df17.merge(df16, on=['adcode', 'model'], how='inner')
    df['9_factor'] = df17['17_9_mean'] / df17['16_9_mean'] # 17年均值除以16年均值得到趋势因子
    #10~12的趋势
    df16 = train16.loc[(train16['regMonth'] >= 10) & (train16['regMonth'] <= 12)].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"16_12_mean": 'mean'}) # 按省份和车型统计均值
    df17 = train17.loc[(train17['regMonth'] >= 10) & (train17['regMonth'] <= 12)].groupby(['adcode', "model"], as_index=False)['salesVolume'].\
                                    agg({"17_12_mean": 'mean'})
    df17 = df17.merge(df16, on=['adcode', 'model'], how='inner')
    df['12_factor'] = df17['17_12_mean'] / df17['16_12_mean'] # 17年均值除以16年均值得到趋势因子
    #对趋势进行幂次平滑
    up_thres,down_thres,up_ratio,down_ratio = 1.2,0.75,0.5,0.5
    for factor in ['3_factor','6_factor','9_factor','12_factor']:
        df.loc[df[factor] > up_thres,factor] = df.loc[df[factor] > up_thres,factor].apply(lambda x:x**up_ratio)
        df.loc[df[factor] < down_thres,factor] = df.loc[df[factor] < down_thres,factor].apply(lambda x:x**down_ratio)
    
    #总体趋势
    def calc_factor(x):
        L = list(x)
        L = sorted(L)
        return 0.6 * L[0] + 0.2 * L[1] + 0.1 * L[2] + 0.1 * L[3]
    
    df['factor'] = df[['3_factor','6_factor','9_factor','12_factor']].apply(lambda x:calc_factor(x),axis=1)
    #对整体趋势进行后处理
    df['factor'] = df['factor'].apply(lambda x:min(x,1.25))
    df['factor'] = df['factor'].apply(lambda x:max(x,0.75))
    
    # 在省份-车型作为主键的情况下,取出16年和17年的数据,共24个月
    for m in range(1, 13):
        df = pd.merge(df, train16[train16['regMonth'] == m][['adcode', 'model', 'salesVolume']], on=['adcode', 'model'], how='left').rename(columns={'salesVolume': m})
        df = pd.merge(df, train17[train17['regMonth'] == m][['adcode', 'model', 'salesVolume']], on=['adcode', 'model'], how='left').rename(columns={'salesVolume': 12+m})
    df = exp_smooth(df,alpha=0.95)
    
    res = pd.DataFrame()
    tmp = df[['adcode', 'model']].copy()
    trend_factor = [0.985,0.965,0.99,0.985]
    for i,m in enumerate([25,26,27,28]):
        #以省份-车型作为主键,计算前年,去年,最近几个月的值,然后加权得到一个当前月份的预测值
        last_year_base = 0.2 * df[m-13].values + 0.6 * df[m-12].values + 0.2 * df[m-11].values
        if m == 25:
            last_last_year_base = 0.8 * df[m-24] + 0.2 * df[m-23]
        else:
            last_last_year_base = 0.2 * df[m-25] + 0.6 * df[m-24] + 0.2 * df[m-23]
        if m <=26:
            near_base = 0.2 * df[m-3] + 0.2 * df[m-2] + 0.3 * df[m-1] + 0.3 * df[m]
        else:
            near_base = 0.2 * df[m-3] + 0.2 * df[m-2] + 0.6 * df[m-1]
        
        #按照三个的大小进行加权求和
        temp = pd.DataFrame()
        temp['near_base'] = near_base
        temp['last_year_base'] = last_year_base
        temp['last_last_year_base'] = last_last_year_base
        def calc(row):
            L = list(row)
            L = sorted(L)
            return 0.6 * L[0] + 0.2 * L[1] + 0.2 * L[2]
        
        temp['base'] = temp.apply(lambda row:calc(row),axis=1)
        base = temp['base']
        tmp['forecastVolum'] = base * df['factor'] * trend_factor[i]
        df[m] = tmp['forecastVolum']
        tmp['regMonth'] = m-24
        res = res.append(tmp, ignore_index=True)
    
    test = pd.merge(test[['id', 'adcode', 'model', 'regMonth']], res, how='left', on=['adcode', 'model', 'regMonth'])
    test['forecastVolum'] = np.exp(test['forecastVolum'])-1
    test.loc[test['forecastVolum'] < 0, 'forecastVolum'] = 0
    #初赛60个车型
    pre_sub = pre_rule()
    pre_test = pd.read_csv(r'.\pre_data\evaluation_public.csv')
    pre_test['forecastVolum'] = pre_sub['forecastVolum']
    pre_test.rename(columns={'forecastVolum':'pre_fore'},inplace=True)
    test = test.merge(pre_test[['adcode','model','regMonth','pre_fore']],on=['adcode','model','regMonth'],how='left')
    test.loc[~test.pre_fore.isnull(),'forecastVolum'] = test.loc[~test.pre_fore.isnull(),'pre_fore']
    return test[['id', 'forecastVolum']]

'*********************************几何加权********************************'
def fusion(sub,sub_rule,sub_lgb):
    sub['rule'] = sub_rule['forecastVolum'].values
    sub['lgb'] = sub_lgb['forecastVolum'].values
    '60个车型1-4月融合'
    sub['forecastVolum'] = -1
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.40) * math.pow(y,0.60)) if z==0 and m==25 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.40) * math.pow(y,0.60)) if z==0 and m==26 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.50) * math.pow(y,0.50)) if z==0 and m==27 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.40) * math.pow(y,0.60)) if z==0 and m==28 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    '22个车型1-4月融合'
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.35) * math.pow(y,0.65)) if z==1 and m<=26 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.40) * math.pow(y,0.60)) if z==1 and m==27 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    sub['forecastVolum'] = list(map(lambda x,y,z,m,f:(math.pow(x,0.40) * math.pow(y,0.60)) if z==1 and m==28 else f,sub['rule'],sub['lgb'],sub['new_model'],sub['time_id'],sub['forecastVolum']))
    sub = sub[['id','forecastVolum']]
    sub['id'] = sub['id'].map(int)
    sub['forecastVolum'] = sub['forecastVolum'].map(int)
    return sub

if __name__=="__main__":
    start = time.clock()
    print('train lgb model...')
    sub_lgb = get_lgb_ans(input_data)
    print('train rule model...')
    sub_rule = rule()
    print('blend lgb and rule...')
    sub = fusion(final_data, sub_rule, sub_lgb)
    print('save final result...')
    sub.to_csv(r'.\sub\sub.csv',index=False)
    print('all procedures are over...')
    print("time used: {0} seconds...".format(int((time.clock() - start))))

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

【比赛方案】2019年CCF大数据与计算智能大赛乘用车细分市场销量预测冠军解决方案 的相关文章

随机推荐