机器学习相关操作方法分享(五)
数据准备阶段
导入基本的包
# Standard-library / third-party imports for the whole analysis script.
import os
import multiprocessing
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.manifold import TSNE
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Matplotlib setup: render CJK (Chinese) labels and the minus sign correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels
plt.rcParams['axes.unicode_minus'] = False     # display minus signs correctly
plt.rcParams['font.family'] = ['sans-serif']

warnings.filterwarnings("ignore")
导入数据
# ---- Load the competition data ----------------------------------------------
path = './data/'
train = pd.read_csv(path + 'train_2.csv')
test = pd.read_csv(path + 'test_2.csv')

# Stacking features produced by the transfer-learning script (see the
# "迁移学习" section at the bottom of this file, which writes these CSVs).
train_stacking = pd.read_csv(path + '/stack/train.csv')
test_stacking = pd.read_csv(path + '/stack/test.csv')

print(len(train), len(test))
# Left-join on user_id so the row counts stay unchanged (verified by the
# prints before/after the merge).
train = train.merge(train_stacking, 'left', 'user_id')
test = test.merge(test_stacking, 'left', 'user_id')
print(len(train), len(test))

train.info()
发现存在object类型的特征,需要进行转换
for i in train['2_total_fee'].values:try:tmp = float(i)except:print(i)for i in train['3_total_fee'].values:try:tmp = float(i)except:print(i)train = train.replace('\N', 0)train['2_total_fee'] = train['2_total_fee'].astype(float)train['3_total_fee'] = train['3_total_fee'].astype(float)train.info()test.info()for i in test['2_total_fee'].values:try:tmp = float(i)except:print(i)test = test.replace('\N', 0)test['2_total_fee'] = test['2_total_fee'].astype(float)
USERID | 用户ID | VARCHAR2(50) | 用户编码,标识用户的唯一字段 |
current_service | 套餐 | VARCHAR2(500) | / |
service_type | 套餐类型 | VARCHAR2(10) | 0:23G融合,1:2I2C,2:2G,3:3G,4:4G |
is_mix_service | 是否固移融合套餐 | VARCHAR2(10) | 1.是 0.否 |
online_time | 在网时长 | VARCHAR2(50) | / |
1_total_fee | 当月总出账金额_月 | NUMBER | 单位:元 |
2_total_fee | 当月前1月总出账金额_月 | NUMBER | 单位:元 |
3_total_fee | 当月前2月总出账金额_月 | NUMBER | 单位:元 |
4_total_fee | 当月前3月总出账金额_月 | NUMBER | 单位:元 |
month_traffic | 当月累计-流量 | NUMBER | 单位:MB |
many_over_bill | 连续超套 | VARCHAR2(500) | 1-是,0-否 |
contract_type | 合约类型 | VARCHAR2(500) | ZBG_DIM.DIM_CBSS_ACTIVITY_TYPE |
contract_time | 合约时长 | VARCHAR2(500) | / |
is_promise_low_consume | 是否承诺低消用户 | VARCHAR2(500) | 1.是 0.否 |
net_service | 网络口径用户 | VARCHAR2(500) | 20AAAAAA-2G |
pay_times | 交费次数 | NUMBER | 单位:次 |
pay_num | 交费金额 | NUMBER | 单位:元 |
last_month_traffic | 上月结转流量 | NUMBER | 单位:MB |
local_trafffic_month | 月累计-本地数据流量 | NUMBER | 单位:MB |
local_caller_time | 本地语音主叫通话时长 | NUMBER | 单位:分钟 |
service1_caller_time | 套外主叫通话时长 | NUMBER | 单位:分钟 |
service2_caller_time | Service2_caller_time | NUMBER | 单位:分钟 |
gender | 性别 | varchar2(100) | 01.男 02女 |
age | 年龄 | varchar2(100) | / |
complaint_level | 投诉重要性 | VARCHAR2(1000) | 1:普通,2:重要,3:重大 |
former_complaint_num | 历史投诉总量 | NUMBER | 单位:次 |
former_complaint_fee | 历史执行补救费用交费金额 | NUMBER | 单位:分 |
# ---- Exploratory data analysis -----------------------------------------------

# Target distribution in the training set.  The 999999 plan is extremely
# rare; treat it as an outlier, since a class with almost no samples carries
# no learnable signal.
train['current_service'].value_counts()

# --- Univariate analysis ---
# Age: the largest customer group is 17-31 years old; 22-year-olds alone are
# ~5% of customers vs. an average of ~1.7%, so age correlates strongly with
# the customer base, with young people over-represented.
train['age'].hist(bins=70)

# Gender: label 1 has more than twice as many customers as label 2.  Gender
# also contains 0 as a missing marker; we tried (a) filling with the
# per-service_type mode and (b) keeping the raw value, and kept the raw
# value because the missing-gender conversion rate differs across plans.
train['gender'].hist(bins=70)

# service_type is concentrated on 1 and 4; 3 is very likely an outlier.
train['service_type'].hist(bins=70)
train['service_type'].value_counts()

# Complaint level is mostly 0.
train['complaint_level'].hist(bins=70)

# --- Multivariate analysis ---
# 1_total_fee (current-month bill) separates the plans visibly, so the four
# *_total_fee columns are promising sources of engineered features;
# discriminative features are generally high-quality features.
fig, ax = plt.subplots(figsize=(10, 10))
service = train['current_service'].unique()
for i in service:
    train[train['current_service'] == i]['1_total_fee'].hist(
        bins=100, label=str(i), ax=ax, alpha=0.5)
plt.xlim([0, 500])
plt.legend()

# Fee vs. age: apart from the age == 0 marker, high-spend regions cluster
# clearly by age.
sns.jointplot(x='age', y='1_total_fee', data=train)
sns.jointplot(x='age', y='month_traffic', data=train)

train.groupby(['current_service'])['1_total_fee'].agg({'count', 'mean'}).reset_index()
# Ignoring service_type == 3, service_type splits the plans into two disjoint
# groups (8 plans vs. 3 plans) — which suggests training separate models per
# group as an experiment.
train[train.service_type != 3].groupby(['current_service', 'service_type'])['user_id'].agg({'count'})
# ---- Feature engineering: preprocessing --------------------------------------
# Concatenate train and test so features are built consistently on both;
# test rows end up with label 0 via fillna and are split out before training.
data = pd.concat([train, test], ignore_index=True).fillna(0)

# current_service is the prediction target.
data['label'] = data.current_service.astype(int)
data = data.replace('\\N', 0)   # literal backslash-N marks nulls in the export
data['gender'] = data.gender.astype(int)
data['service_type'].value_counts()
# service_type 3 is an outlier; fold it into 4.
data.loc[data['service_type'] == 3, 'service_type'] = 4

# Original feature split: categorical vs. numeric.
origin_cate_feature = ['service_type', 'complaint_level', 'contract_type', 'gender',
                       'is_mix_service', 'is_promise_low_consume',
                       'many_over_bill', 'net_service']
origin_num_feature = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                      'age', 'contract_time',
                      'former_complaint_fee', 'former_complaint_num',
                      'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
                      'month_traffic', 'online_time', 'pay_num', 'pay_times',
                      'service1_caller_time', 'service2_caller_time']

# Some numeric columns were read as object; force them all to float.
for i in origin_num_feature:
    data[i] = data[i].astype(float)
# ---- Embedding features (Word2Vec) -------------------------------------------
# Treat each user's four monthly fees as a "sentence" of fee tokens and learn
# an L-dimensional embedding per distinct fee value; one CSV per fee column.
# (The gensim imports are already at the top of the file.)
L = 10  # embedding dimension

sentence = []
for line in list(data[['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']].values):
    sentence.append([str(float(l)) for idx, l in enumerate(line)])

print('training...')
# NOTE(review): `size=`/`iter=` and `model[token]` are gensim < 4 APIs; for
# gensim >= 4 these become `vector_size=`/`epochs=` and `model.wv[token]`.
model = Word2Vec(sentence, size=L, window=2, min_count=1,
                 workers=multiprocessing.cpu_count(), iter=10)
print('outputing...')

for fea in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    # Distinct fee values of this column.
    values = set(data[fea].values)
    print(len(values))
    # Dump "value, <fea>W0..W{L-1}" rows to ./data/w2v/<fea>.csv.
    w2v = []
    for i in values:
        a = [i]
        a.extend(model[str(float(i))])
        w2v.append(a)
    out_df = pd.DataFrame(w2v)
    name = [fea]
    for i in range(L):
        name.append(name[0] + 'W' + str(i))
    out_df.columns = name
    out_df.to_csv('./data/w2v/' + fea + '.csv', index=False)
# ---- t-SNE visualisation of the fee embeddings -------------------------------
# (TSNE and pyplot are already imported at the top of the file.)
df = pd.read_csv('./data/w2v/3_total_fee.csv')
l = list(df['3_total_fee'].astype('str'))
name = list(df)   # column 0 is the fee value, the rest are the w2v dims


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D embeddings, annotate each point with its label,
    and save the figure to `filename`."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(10, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), textcoords='offset points',
                     ha='right', va='bottom')
    plt.savefig(filename)


tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 300   # embed/plot only the first 300 fee values
low_dim_embs = tsne.fit_transform(df.iloc[:plot_only][name[1:]])
labels = [l[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)
# ---- Merge the embedding features back into `data` ---------------------------
# (One could also cluster these embeddings and check how well the clusters
# separate the plans.)  The prints confirm the left-merge keeps row counts.
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    df = pd.read_csv('./data/w2v/' + col + '.csv')
    df = df.drop_duplicates([col])
    fs = list(df)
    fs.remove(col)          # keep only the embedding columns as features
    w2v_features += fs
    print(len(data))
    data = pd.merge(data, df, on=col, how='left')
    print(len(data))
print(w2v_features)
# ---- Count (frequency) features ----------------------------------------------
# Do the most frequent fee values discriminate between plans?
train['1_total_fee'].value_counts().head(20).plot(kind='bar', figsize=(16, 9))
train[train['1_total_fee'] == 106]['current_service'].value_counts()

count_feature_list = []   # names of all generated count features


def feature_count(data, features=None):
    """Append a group-size ("count") feature over `features` to `data`.

    For groups involving service_type it also appends the same count computed
    on first-round data as a transfer feature.
    NOTE(review): `train_first` is only loaded in the transfer-learning
    section below — confirm it exists before calling this with
    'service_type' in `features`, otherwise this raises NameError.
    """
    features = features or []        # avoid a mutable default argument
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '' + i.replace('add', '')
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)
    # Transfer feature: the same count computed on the first-round data.
    if 'service_type' in features:
        temp_2 = train_first.groupby(features).size().reset_index().rename(
            columns={0: 'train_' + new_feature})
        data = data.merge(temp_2, 'left', on=features)
        count_feature_list.append('train_' + new_feature)
    return data


data = feature_count(data, ['1_total_fee'])
data = feature_count(data, ['2_total_fee'])
data = feature_count(data, ['3_total_fee'])
data = feature_count(data, ['4_total_fee'])
data = feature_count(data, ['former_complaint_fee'])
data = feature_count(data, ['pay_num'])
data = feature_count(data, ['contract_time'])
data = feature_count(data, ['last_month_traffic'])
data = feature_count(data, ['online_time'])
# ---- Pairwise count / transfer features --------------------------------------
# Counts over (service_type | contract_type, x) pairs; the service_type
# groups additionally get the first-round transfer counts inside
# feature_count.  (Original lacked the loop-body indentation.)
for i in ['service_type', 'contract_type']:
    data = feature_count(data, [i, '1_total_fee'])
    data = feature_count(data, [i, '2_total_fee'])
    data = feature_count(data, [i, '3_total_fee'])
    data = feature_count(data, [i, '4_total_fee'])
    data = feature_count(data, [i, 'former_complaint_fee'])
    data = feature_count(data, [i, 'pay_num'])
    data = feature_count(data, [i, 'contract_time'])
    data = feature_count(data, [i, 'last_month_traffic'])
    data = feature_count(data, [i, 'online_time'])
# ---- Compare preliminary-round (train_1/test_1) and final-round data ---------
train1 = pd.read_csv(path + 'train_1.csv')
test1 = pd.read_csv(path + 'test_1.csv')
data1 = pd.concat([train1, test1], ignore_index=True).fillna(0)

# current_service is the target here too.
data1['label'] = data1.current_service.astype(int)
data1 = data1.replace('\\N', 0)
data1['gender'] = data1.gender.astype(int)

# Even the basic count statistics already reveal a clear distribution shift
# between the preliminary- and final-round data.
train['current_service'].value_counts()
train1['current_service'].value_counts()
train1['1_total_fee'].value_counts().head(20).plot(kind='bar', figsize=(16, 9))
# ---- Difference / ratio features ---------------------------------------------
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3',
                     'last_month_traffic_rest', 'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min',
                     'total_caller_time', 'service2_caller_ratio', 'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio',
                     'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee',
                     '1_total_fee_trfc_fee']

# Month-over-month fee deltas, and payment minus current bill.
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']

# Traffic beyond last month's carry-over, clipped at 0.  Use .loc instead of
# the original chained indexing (data[col][mask] = 0), which is unreliable
# under copy-on-write and raises in pandas >= 3.
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']

# Row-wise stats over the four monthly fees.
total_fee = [str(i) + '_total_fee' for i in range(1, 5)]
data['total_fee_mean'] = data[total_fee].mean(1)
data['total_fee_max'] = data[total_fee].max(1)
data['total_fee_min'] = data[total_fee].min(1)

# Call-time totals and ratios (may produce inf/NaN when the total is 0 —
# behavior kept from the original, LightGBM tolerates it).
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']

data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']

# Residual fee after pricing calls at 0.15/min and traffic at 0.3/MB
# (presumed tariff constants — confirm against the operator's pricing).
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = (data['1_total_fee']
                                - (data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3)
# The traffic-residual feature is meaningless for service_type 1.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None
# ---- Final feature list and dtypes -------------------------------------------
cate_feature = origin_cate_feature
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features
# LightGBM consumes categoricals via the 'category' dtype.
for i in cate_feature:
    data[i] = data[i].astype('category')
for i in num_feature:
    data[i] = data[i].astype(float)
feature = cate_feature + num_feature
print(len(feature), feature)
# ---- Model training ----------------------------------------------------------
def f1_score_vali(preds, data_vali):
    """LightGBM `feval`: squared macro-F1 on the validation fold.

    `preds` arrives flattened class-major for multiclass objectives, hence
    the (11, -1) reshape followed by argmax over axis 0.  The score is
    squared (the competition's official metric); the trailing True marks
    "higher is better".
    """
    labels = data_vali.get_label()
    preds = np.argmax(preds.reshape(11, -1), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=preds, average='macro')
    return 'f1_score', score_vali ** 2, True
# ---- Build the training matrix and encode the target -------------------------
# label == 0 marks test rows (from fillna); 999999 is the outlier plan.
X = data[(data.label != 0) & (data.label != 999999)][feature].reset_index(drop=True)
y = data[(data.label != 0) & (data.label != 999999)].label.reset_index(drop=True)

# Bidirectional mapping between raw plan ids and contiguous classes 0..10.
label2current_service = dict(zip(range(0, len(set(y))), sorted(list(set(y)))))
current_service2label = dict(zip(sorted(list(set(y))), range(0, len(set(y)))))
y = pd.Series(y).map(current_service2label)

params = {
    "learning_rate": 0.1,
    "boosting": 'gbdt',
    "lambda_l2": 0.1,
    "max_depth": -1,
    "num_leaves": 128,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "max_bin": 1500,
    "metric": None,            # metric comes from the custom feval
    "objective": "multiclass",
    "num_class": 11,
    "silent": True,
    "nthread": 10,
    "verbose": -1,
}

cv_pred = []                     # per-fold test-set predictions (stacked later)
oof_pred = np.zeros(X.shape[0])  # out-of-fold predictions on the train set
skf = StratifiedKFold(n_splits=5, random_state=20181, shuffle=True)
# ---- 5-fold stratified CV training -------------------------------------------
for index, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(index)
    train_x, test_x = X.loc[train_index], X.loc[test_index]
    train_y, test_y = y.loc[train_index], y.loc[test_index]
    train_data = lgb.Dataset(train_x, label=train_y)
    valid_data = lgb.Dataset(test_x, label=test_y)
    clf = lgb.train(params, train_data, num_boost_round=2000,
                    valid_sets=[valid_data], feval=f1_score_vali,
                    verbose_eval=20, early_stopping_rounds=50)
    # Predict the unlabeled test rows (label == 0) with this fold's model.
    y_test = clf.predict(data[data.label == 0][feature], num_iteration=clf.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    # BUG FIX: out-of-fold predictions belong at the *validation* indices.
    # The original wrote predictions for train_index, leaking in-fold fits
    # into oof_pred and leaving the held-out rows unfilled.
    oof_pred[test_index] = [np.argmax(x) for x in
                            clf.predict(X.loc[test_index][feature],
                                        num_iteration=clf.best_iteration)]
    # Stack each fold's test predictions column-wise for later voting.
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
# ---- Majority-vote the fold predictions and write the submission --------------
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))   # most frequent class wins

result = pd.DataFrame()
result['user_id'] = data[data.label == 0]['user_id']
result['predict'] = submit
result['predict'] = result['predict'].map(label2current_service)
# This user is the single known 999999-plan sample; hard-code its label.
result.loc[result['user_id'] == '4VNcD6kE0sjnAvFX', 'predict'] = 999999
print(len(result), result.predict.value_counts())
print(result.sort_values('user_id').head())
result[['user_id', 'predict']].to_csv(path + '/sub.csv', index=False)
# ---- Error analysis: per-class F1 and confusion matrix ------------------------
# Quick model on the last fold's split to compare class-level F1 scores.
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=32, reg_alpha=0,
                               reg_lambda=0., max_depth=-1, n_estimators=100,
                               objective='multiclass', metric="None",
                               subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
                               learning_rate=0.2, random_state=2018, n_jobs=10)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)

score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)

from sklearn.metrics import confusion_matrix
conf_mx = confusion_matrix(test_y, y_pred=lgb_model.predict(test_x))
plt.matshow(conf_mx, cmap=plt.cm.gray)
pd.DataFrame(conf_mx)
# Reading the confusion matrix: the plan ending in 830 has only ~2000 samples
# and nearly half are misclassified as the plan ending in 166, while 166 is
# rarely confused with 830.
# ---- Compare plans 99999830 vs. 89950166 --------------------------------------
# Both plans look nearly identical in fees and traffic, so we train a
# dedicated binary classifier (830 = positive, 166 = negative) and overwrite
# predicted-830 rows whose 830-probability falls below 0.5 with 166.
sns.jointplot(x='age', y='1_total_fee', data=train[train.current_service == 99999830])
sns.jointplot(x='age', y='1_total_fee', data=train[train.current_service == 89950166])
sns.jointplot(x='age', y='month_traffic', data=train[train.current_service == 99999830])
sns.jointplot(x='age', y='month_traffic', data=train[train.current_service == 89950166])
train[train.current_service == 99999830]['complaint_level'].hist(bins=70)
train[train.current_service == 89950166]['complaint_level'].hist(bins=70)
# ==============================================================================
# Transfer learning: train on first-round data, predict class probabilities
# for second-round rows, and save them as the stacking features consumed at
# the top of this file.
# ==============================================================================
import os
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")

path = './data/'
w2v_path = path + 'w2v'
train = pd.read_csv(path + 'train_2.csv')
test = pd.read_csv(path + 'test_2.csv')
train_first = pd.read_csv(path + 'train_1.csv')

# data_type: 1 = first-round rows (training source), 0 = second-round rows.
train['data_type'] = 0
test['data_type'] = 0
train_first['data_type'] = 1
data = pd.concat([train, test, train_first], ignore_index=True).fillna(0)
data['label'] = data.current_service.astype(int)
data = data.replace('\\N', 999)   # here the null marker becomes 999, not 0
data['gender'] = data.gender.astype(int)

origin_cate_feature = ['service_type', 'complaint_level', 'contract_type', 'gender',
                       'is_mix_service', 'is_promise_low_consume',
                       'many_over_bill', 'net_service']
origin_num_feature = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                      'age', 'contract_time',
                      'former_complaint_fee', 'former_complaint_num',
                      'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
                      'month_traffic', 'online_time', 'pay_num', 'pay_times',
                      'service1_caller_time', 'service2_caller_time']
for i in origin_num_feature:
    data[i] = data[i].astype(float)

# Merge the previously exported Word2Vec embedding features.
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    df = pd.read_csv(w2v_path + '/' + col + '.csv')
    df = df.drop_duplicates([col])
    fs = list(df)
    fs.remove(col)
    w2v_features += fs
    data = pd.merge(data, df, on=col, how='left')

count_feature_list = []


def feature_count(data, features=None):
    """Append a group-size ("count") feature over `features` to `data`.

    A pre-existing column of the same name is dropped first so re-runs are
    idempotent (this variant has no transfer-count branch).
    """
    features = features or []   # avoid a mutable default argument
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '' + i.replace('add', '')
    try:
        del data[new_feature]
    except KeyError:            # narrow: original used a bare except
        pass
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)
    return data
# ---- Count features for the transfer model ------------------------------------
data = feature_count(data, ['1_total_fee'])
data = feature_count(data, ['2_total_fee'])
data = feature_count(data, ['3_total_fee'])
data = feature_count(data, ['4_total_fee'])
data = feature_count(data, ['former_complaint_fee'])
data = feature_count(data, ['pay_num'])
data = feature_count(data, ['contract_time'])
data = feature_count(data, ['last_month_traffic'])
data = feature_count(data, ['online_time'])

# Pairwise counts (original lacked the loop-body indentation).
for i in ['service_type', 'contract_type']:
    data = feature_count(data, [i, '1_total_fee'])
    data = feature_count(data, [i, '2_total_fee'])
    data = feature_count(data, [i, '3_total_fee'])
    data = feature_count(data, [i, '4_total_fee'])
    data = feature_count(data, [i, 'former_complaint_fee'])
    data = feature_count(data, [i, 'pay_num'])
    data = feature_count(data, [i, 'contract_time'])
    data = feature_count(data, [i, 'last_month_traffic'])
    data = feature_count(data, [i, 'online_time'])
# ---- Difference / ratio features (same recipe as the main script) -------------
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3',
                     'last_month_traffic_rest', 'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min',
                     'total_caller_time', 'service2_caller_ratio', 'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio',
                     'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee',
                     '1_total_fee_trfc_fee']

data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']

# .loc instead of chained indexing (data[col][mask] = 0), which is unreliable
# under copy-on-write and raises in pandas >= 3.
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']

total_fee = [str(i) + '_total_fee' for i in range(1, 5)]
data['total_fee_mean'] = data[total_fee].mean(1)
data['total_fee_max'] = data[total_fee].max(1)
data['total_fee_min'] = data[total_fee].min(1)

data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']

data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']

data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = (data['1_total_fee']
                                - (data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3)
# The traffic-residual feature is meaningless for service_type 1.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None
# ---- Train the transfer model and export stacking features --------------------
cate_feature = origin_cate_feature
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features
for i in cate_feature:
    data[i] = data[i].astype('category')
for i in num_feature:
    data[i] = data[i].astype(float)
feature = cate_feature + num_feature
print(len(feature), feature)

# Drop the 999999 outlier plan; train on first-round rows, evaluate on the
# labelled second-round rows (label == 0 are the unlabeled test rows).
data = data[data.label != 999999]
train_x = data[(data.data_type == 1)][feature]
train_y = data[(data.data_type == 1)].label
test_x = data[(data.data_type == 0) & (data.label != 0)][feature]
test_y = data[(data.data_type == 0) & (data.label != 0)].label

lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=120, reg_alpha=0,
                               reg_lambda=0., max_depth=-1, n_estimators=2500,
                               objective='multiclass', metric="None",
                               subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
                               learning_rate=0.035, random_state=2018, n_jobs=10)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)

# Write the per-class probabilities as stacking features for the main model.
stacking_path = path + 'stack/'
if not os.path.exists(stacking_path):
    print(stacking_path)
    os.makedirs(stacking_path)
train_proba = lgb_model.predict_proba(test_x[feature])
test_proba = lgb_model.predict_proba(data[data.label == 0][feature])
print(len(train_proba), len(test_proba))
stacking_train = data[(data.data_type == 0) & (data.label != 0)][['user_id']]
stacking_test = data[data.label == 0][['user_id']]
for i in range(11):
    stacking_train['stacking_' + str(i)] = train_proba[:, i]
    stacking_test['stacking_' + str(i)] = test_proba[:, i]
stacking_train.to_csv(stacking_path + 'train.csv', index=False)
stacking_test.to_csv(stacking_path + 'test.csv', index=False)

# Per-class F1 of the transfer model on the second-round labelled rows.
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)