标签:transaction lightGBM NaN 诈骗 案例 shape train test identity
一、数据预处理
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings("ignore")
%%time
# Load the four raw CSVs: transaction tables (one row per transaction) and
# identity tables (device/browser info, only for a subset of transactions).
train_transaction = pd.read_csv("train_transaction.csv")
print("train_transaction shape : ", train_transaction.shape)
train_identity = pd.read_csv("train_identity.csv")
print("train_identity shape : ", train_identity.shape)
test_transaction = pd.read_csv("test_transaction.csv")
print("test_transaction shape : ", test_transaction.shape)
test_identity = pd.read_csv("test_identity.csv")
print("test_identity shape : ", test_identity.shape)
train_transaction shape : (590540, 394)
train_identity shape : (144233, 41)
test_transaction shape : (506691, 393)
test_identity shape : (141907, 41)
Wall time: 21.5 s
# Show the first 5 rows (pandas default) for a quick look at the columns.
train_transaction.head()
TransactionID | isFraud | TransactionDT | TransactionAmt | ProductCD | card1 | card2 | card3 | card4 | card5 | ... | V330 | V331 | V332 | V333 | V334 | V335 | V336 | V337 | V338 | V339 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2987000 | 0 | 86400 | 68.5 | W | 13926 | NaN | 150.0 | discover | 142.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2987001 | 0 | 86401 | 29.0 | W | 2755 | 404.0 | 150.0 | mastercard | 102.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2987002 | 0 | 86469 | 59.0 | W | 4663 | 490.0 | 150.0 | visa | 166.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2987003 | 0 | 86499 | 50.0 | W | 18132 | 567.0 | 150.0 | mastercard | 117.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2987004 | 0 | 86506 | 50.0 | H | 4497 | 514.0 | 150.0 | mastercard | 102.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 394 columns
# Inspect column dtypes and memory usage of the training transactions.
train_transaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB
# Count missing (NaN) values per column.
train_transaction.isnull().sum()
TransactionID 0
isFraud 0
TransactionDT 0
TransactionAmt 0
ProductCD 0
...
V335 508189
V336 508189
V337 508189
V338 508189
V339 508189
Length: 394, dtype: int64
同样的方式查看train_identity、test_transaction、test_identity的数据类型、数据信息和空值情况
根据 TransactionID 合并 train_transaction 和 train_identity,test_transaction 和 test_identity
# From the shapes above, not every transaction has a matching identity row;
# measure how many training transactions can be joined on TransactionID.
matched = train_transaction['TransactionID'].isin(train_identity['TransactionID'])
train_count = matched.sum()
print("train_transaction与train_identity基于TransactionID相关联的数据量 : ", train_count)
train_ratio = train_count / len(train_transaction)
print("相关联数据量占整个train_transaction数据量的比例是 : {:.2f}%".format(train_ratio * 100))
train_transaction与train_identity基于TransactionID相关联的数据量 : 144233
相关联数据量占整个train_transaction数据量的比例是 : 24.42%
# Same joinability check for the test-side tables.
matched = test_transaction['TransactionID'].isin(test_identity['TransactionID'])
test_count = matched.sum()
print("test_transaction与test_identity基于TransactionID相关联的数据量 : ", test_count)
test_ratio = test_count / len(test_transaction)
print("相关联数据量占整个test_transaction数据量的比例是 : {:.2f}%".format(test_ratio * 100))
test_transaction与test_identity基于TransactionID相关联的数据量 : 141907
相关联数据量占整个test_transaction数据量的比例是 : 28.01%
# Left-join identity columns onto each transaction; transactions without an
# identity record keep NaN in the identity columns.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')
train.shape:(590540, 434)
test.shape:(506691, 433)
训练数据比测试数据多一个标签特征
二、数据探索分析
在train_transaction中,正负样本比例
# Class balance: fraud (isFraud=1) is a small minority of the transactions.
sns.countplot('isFraud', data=train)
plt.title("Normal VS Fraud")
plt.show()
交易金额分布
# Log-transform TransactionAmt to compress its heavy right tail before plotting.
train['TransactionAmt'].apply(np.log).plot(kind='hist', bins=100, figsize=(15, 5), title='Distribution of Transaction Amount')
交易金额(正常 vs 欺诈)分布
为了展示得更为清晰,下面分别绘制欺诈交易与正常交易的金额分布(对数尺度与原始尺度各一幅)
# 2x2 grid: transaction amounts for fraud (top row) vs normal (bottom row),
# in log scale (left column) and raw scale (right column).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 6)) # canvas size
train.loc[train['isFraud']==1]['TransactionAmt'].apply(np.log).plot(kind='hist',
bins=100,
title='Log Transaction Amount isFraud = 1',
ax=ax1)
train.loc[train['isFraud']==1]['TransactionAmt'].plot(kind='hist',
bins=100,
title='Transaction Amount isFraud=1',
ax=ax2)
train.loc[train['isFraud']==0]['TransactionAmt'].apply(np.log).plot(kind='hist',
bins=100,
title='Log Transaction Amount isFraud = 0',
ax=ax3)
train.loc[train['isFraud']==0]['TransactionAmt'].plot(kind='hist',
bins=100,
title='Transaction Amount isFraud=0',
ax=ax4)
plt.show()
分析 ProductCD
# Transaction volume per ProductCD category.
train.groupby('ProductCD')['TransactionID'].count().plot(kind='barh',
figsize=(15, 6),
title='ProductCD TransctionID')
plt.show()
# Fraud rate per ProductCD category (mean of the 0/1 label).
train.groupby('ProductCD')['isFraud'].mean().plot(kind='barh',
figsize=(15, 6),
title='ProductCD isFraud')
plt.show()
分类变量进行转换 LabelEncoder
LabelEncoder :将离散型的数据转换成 0 到 n − 1 之间的数,这里 n 是一个列表的不同取值的个数,可以认为是某个特征的所有不同取值的个数。
# List columns still holding raw strings (dtype == object); these need
# label encoding before LightGBM can consume them.
object_columns = [c for c in train.columns if train[c].dtype == "object"]
for c in object_columns:
    print(c)
ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
# Fraud rate per DeviceType (mean of the 0/1 label per group).
train.groupby("DeviceType").mean()["isFraud"].plot(kind='barh',
figsize=(15, 5),
title = "DeviceType Distribution")
plt.show()
统计每一列的NaN的数量,如果比例超过70%,则删除该列
def clean_nan(df, threshold=0.7):
    """Drop columns whose fraction of NaN values exceeds ``threshold``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified in place.
    threshold : float, default 0.7
        Columns with a missing-value ratio strictly greater than this
        are removed (the original hard-coded 0.7 is kept as the default,
        so existing callers behave identically).

    Returns
    -------
    pd.DataFrame
        A new frame without the mostly-empty columns.
    """
    # mean() of the boolean null mask is the per-column NaN fraction;
    # this replaces the original per-column Python loop with one
    # vectorized pass.
    null_ratio = df.isnull().mean()
    cols_to_drop = null_ratio[null_ratio > threshold].index.tolist()
    return df.drop(cols_to_drop, axis=1)
# Drop mostly-empty columns (NaN ratio > 70%) from the training frame.
print("原始的train shape : ", train.shape)
train = clean_nan(train)
print("清洗后的train shape : ", train.shape)
原始的train shape : (590540, 434)
清洗后的train shape : (590540, 226)
训练数据清洗掉208个特征
# Same NaN-ratio cleaning for the test frame.
print("原始的test shape : ", test.shape)
test = clean_nan(test)
print("清洗后的test shape : ", test.shape)
原始的test shape : (506691, 433)
清洗后的test shape : (506691, 225)
测试数据清洗掉208个特征
删除那些一列中某个类别数据量超过90%的列
def clean_top_cols(df, threshold=0.9):
    """Return the names of near-constant columns.

    A column qualifies when its single most frequent value (NaN counted
    as a value, via ``dropna=False``) covers more than ``threshold`` of
    the rows; such columns carry almost no signal for the model.

    Parameters
    ----------
    df : pd.DataFrame
    threshold : float, default 0.9
        Dominance ratio above which a column is flagged (the original
        hard-coded 0.9 is kept as the default).

    Returns
    -------
    list of str
        Column names to drop (the caller performs the drop).
    """
    # Guard: with zero rows value_counts() is empty and indexing its
    # first entry would raise IndexError.
    if df.empty:
        return []
    return [col for col in df.columns
            if df[col].value_counts(dropna=False, normalize=True).iloc[0] > threshold]
# Collect near-constant columns from both frames, dedupe, and make sure the
# label column never ends up in the drop list.
train_cols = clean_top_cols(train)
test_cols = clean_top_cols(test)
cols_to_drop = list(set(train_cols + test_cols)) # set() removes duplicates
'isFraud' in cols_to_drop # check whether the label column was flagged
cols_to_drop.remove('isFraud') # keep the label: take it out of the drop list
# Drop the near-constant columns from the training frame.
print("原始的train shape : ", train.shape)
train = train.drop(cols_to_drop, axis=1)
print("清理后的train shape : ", train.shape)
原始的train shape : (590540, 226)
清理后的train shape : (590540, 156)
# 清理掉这些列
print("原始的test shape : ", test.shape)
test = test.drop(cols_to_drop, axis=1)
print("清理后的test shape : ", test.shape)
原始的test shape : (506691, 225)
清理后的test shape : (506691, 155)
# Label-encode every remaining object column. The encoder is fit on
# train + test together so that categories appearing only in the test set
# still receive a valid integer code at transform time.
for col in tqdm_notebook(train.columns):  # tqdm_notebook renders a progress bar
    if train[col].dtype == "object":
        encoder = LabelEncoder()
        encoder.fit(list(train[col].values) + list(test[col].values))
        train[col] = encoder.transform(list(train[col].values))
        test[col] = encoder.transform(list(test[col].values))
# Rebuild a clean 0..n-1 RangeIndex. drop=True gives the same result as the
# original reset_index() followed by `del df['index']`, without materialising
# and then deleting the intermediate 'index' column.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape:(590540, 156)
test.shape:(506691, 155)
# Sort chronologically by TransactionDT, then separate features from the label.
# TransactionDT and TransactionID are identifiers/time, not predictive features.
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']
X.shape:(590540, 153)
y.shape:(590540,)
# Test-side split: X_test keeps only model features; the identifier columns
# are retained separately (e.g. for building a submission later).
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1) # drop non-feature columns
test = test[['TransactionDT', 'TransactionID']]
X_test.shape:(506691, 153)
test.shape:(506691, 2)
三、LightGBM建模
params 超参数设置
params = {'num_leaves': 491, # max leaves per tree
          'min_data_in_leaf': 106, # min samples required in each leaf
          'objective': 'binary', # binary classification task
          'max_depth': -1, # -1 : unlimited tree depth
          "boosting_type": "gbdt", # alternatives: 'dart', 'goss', 'rf'
          "metric": 'auc', # evaluation metric
          "verbosity" : -1, # silence LightGBM logging
          'random_state': 66, # seed for reproducibility
         }
创建DataFrame保存特征重要性
# Frame that will accumulate one feature-importance column per CV fold.
feature_importances = pd.DataFrame(index=None)
feature_importances['features'] = X.columns
feature_importances
features | |
---|---|
0 | TransactionAmt |
1 | ProductCD |
2 | card1 |
3 | card2 |
4 | card3 |
... | ... |
148 | V312 |
149 | V313 |
150 | V314 |
151 | V315 |
152 | V317 |
153 rows × 1 columns
5折交叉验证
# 5-fold cross-validation (no shuffling, so the chronological sort is kept).
folds = KFold(n_splits=5)
# BUG FIX: KFold.split returns a generator; the original code called
# next(iter(splits)) to peek at the first split, which silently CONSUMED
# fold 1 and left only 4 folds for the training loop (the notebook output
# shows only fold_1..fold_4 columns). Materialise the splits as a list so
# inspection does not consume anything.
splits = list(folds.split(X, y))
splits[0]  # inspect the first split's (train_indices, val_indices) arrays
(array([118108, 118109, 118110, ..., 590537, 590538, 590539]),
array([ 0, 1, 2, ..., 118105, 118106, 118107]))
# Cross-validated LightGBM training: keep the model of the best-AUC fold.
best_auc = 0
best_model = None
for k, (train_indices, val_indices) in enumerate(splits):
    print("第 %d 折\n" % k)
    # Row-wise split into training and validation partitions.
    X_train_data, X_val_data = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    train_dataset = lgb.Dataset(X_train_data, label=y_train)
    val_dataset = lgb.Dataset(X_val_data, label=y_val)
    lgb_model = lgb.train(params=params,              # hyper-parameters
                          train_set=train_dataset,
                          num_boost_round=10000,      # upper bound; early stopping ends sooner
                          valid_sets=val_dataset,
                          valid_names='validation',
                          early_stopping_rounds=200,) # stop if no AUC gain for 200 rounds
    # Record this fold's split-count feature importances.
    feature_importances[f'fold_{k+1}'] = lgb_model.feature_importance()
    # Score the fold on its held-out validation partition.
    y_val_pred = lgb_model.predict(X_val_data)
    roc_auc = roc_auc_score(y_val, y_val_pred)
    print(f" Fold {k + 1} | AUC_ROC : { roc_auc * 100}%")
    # Keep the best-performing fold's model.
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = lgb_model
# BUG FIX: the original averaged over range(folds.n_splits - 1) fold columns,
# hard-coding the assumption that one fold was missing. Average over whatever
# fold_* columns actually exist, which is correct regardless of how many
# folds ran.
fold_cols = [c for c in feature_importances.columns if c.startswith('fold_')]
feature_importances['average'] = feature_importances[fold_cols].mean(axis=1)
feature_importances.head()
features | fold_1 | fold_2 | fold_3 | fold_4 | average | |
---|---|---|---|---|---|---|
0 | TransactionAmt | 10758 | 2337 | 2571 | 3088 | 4688.5 |
1 | ProductCD | 478 | 148 | 189 | 179 | 248.5 |
2 | card1 | 10241 | 2791 | 3168 | 3450 | 4912.5 |
3 | card2 | 8222 | 2517 | 2606 | 3039 | 4096.0 |
4 | card3 | 559 | 214 | 263 | 310 | 336.5 |
# Plot the 50 most important features, ranked by their cross-fold average.
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50),
            x='average',
            y='features')
# BUG FIX: the title previously hard-coded folds.n_splits - 1; count the
# fold_* columns that actually exist so the label matches the data plotted.
n_folds_used = sum(c.startswith('fold_') for c in feature_importances.columns)
plt.title("50 top features importance over {} folds average.".format(n_folds_used))
# BUG FIX: the original printed roc_auc — the LAST fold's score, not the best —
# under a "best" label; best_auc is the tracked maximum.
print("The best roc_auc : ", best_auc)
The best roc_auc : 0.9187823659441293
joblib.dump(best_model, "best_model.pkl") # persist the best CV model to disk
对与test进行预测
# Predict fraud probabilities for the test set with the best model.
y_test_pred = best_model.predict(X_test)
labels = np.round(y_test_pred) # np.round(): threshold probabilities at 0.5
from collections import Counter
Counter(labels)  # count predicted normal (0.0) vs fraud (1.0)
Counter({0.0: 498254, 1.0: 8437})
标签:transaction,lightGBM,NaN,诈骗,案例,shape,train,test,identity 来源: https://www.cnblogs.com/wkfvawl/p/16629316.html
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。