Deep and Cross Network: Principles and Implementation

Deep & Cross Network for Ad Click Predictions is a paper released by four Chinese researchers at Google. As the title suggests, it uses a more elaborate network architecture for CTR prediction and is very similar to Google's 2016 wide and deep model, so I used some spare time to work through the paper and put together a simple implementation.

Principles

Input

Let's start from the wide and deep model. On both the wide side and the deep side, the input consists of embedding features + cross features + continuous features. This creates a small problem: cross features still have to be hand-crafted, so deciding which categorical features to cross, and how to cross the continuous features, are nuisances that need to be resolved. The deep and cross model does away with all of this: at the input level there are only embedding columns + continuous columns, and the notion of feature crossing is realized entirely inside the network structure.

Cross Network

Define the input embedding columns + continuous columns as $x_0$ ($x_0 \in R^d$). The $(l+1)$-th cross layer is then

$$x_{l+1} = x_0 x_l^T w_l + b_l + x_l$$

where $w_l \in R^d$ and $b_l \in R^d$ are the parameters of the $l$-th layer. Seen this way, the cross network has very few parameters overall, just $layers \times d \times 2$; every layer keeps the same dimensionality, and the final output has the same dimension as the input. At the same time, the idea of feature crossing shows up at every layer: the higher-level representation produced by the current layer is crossed pairwise with the raw features fed into the first layer. As for why $x_l$ is added back at the end, I believe this borrows from ResNet: what the model ultimately has to fit is the residual term $x_{l+1} - x_l$.
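
To make the recurrence concrete, here is a minimal NumPy sketch of a single cross step (the names d, x0, x_l, w_l, b_l are mine and simply mirror the symbols above; this illustrates the formula itself, not the Keras CrossLayer defined later):

import numpy as np

d = 4                                    # dimension of x_0
rng = np.random.default_rng(0)
x0 = rng.normal(size=d)                  # raw input: embedding + continuous columns
x_l = x0.copy()                          # output of the previous cross layer (equal to x_0 at the first layer)
w_l = rng.normal(size=d)                 # weight vector of this layer, d parameters
b_l = np.zeros(d)                        # bias vector of this layer, d parameters

# x_{l+1} = x_0 x_l^T w_l + b_l + x_l
# x_l^T w_l is a scalar, so the d x d outer product x_0 x_l^T never has to be materialized
x_next = x0 * np.dot(x_l, w_l) + b_l + x_l
print(x_next.shape)                      # (4,) -- same dimension as the input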

Deep Network

There is not much to say here: this part is the same as a conventional DNN. The input comes in and passes through a simple stack of N fully-connected layers, so the bulk of the parameters lives on the deep side.
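
To put "the bulk of the parameters lives on the deep side" in numbers, here is a quick back-of-the-envelope count using the sizes from the implementation below (a 29-dimensional input, 6 deep layers of 272 units, 8 cross layers); the variable names are mine:

# rough parameter count for the two sub-networks
d = 29                                   # input width: 15-dim embedding + 14 continuous features
deep_width, deep_layers = 272, 6
cross_layers = 8

deep_params = (d * deep_width + deep_width) \
    + (deep_layers - 1) * (deep_width * deep_width + deep_width)
cross_params = cross_layers * d * 2      # one weight vector and one bias vector per cross layer

print(deep_params)                       # 379440
print(cross_params)                      # 464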

Output

The outputs of the cross network and the deep network are concatenated; for a multi-class problem, a softmax layer on top is all that is needed.
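
A minimal sketch of this output stage in plain NumPy (deep_out, cross_out and num_classes are placeholder names; their sizes mirror the code below, and Forest CoverType has 7 classes):

import numpy as np

deep_out = np.random.randn(272)          # stand-in for the last deep layer's output
cross_out = np.random.randn(29)          # stand-in for the flattened cross-network output
num_classes = 7                          # Forest CoverType cover types

stacked = np.concatenate([deep_out, cross_out])         # concat the two branches
W = 0.01 * np.random.randn(num_classes, stacked.size)   # final dense layer weights
b = np.zeros(num_classes)

logits = W @ stacked + b
probs = np.exp(logits - logits.max())
probs /= probs.sum()                     # softmax over the classes
print(probs.sum())                       # 1.0 (up to floating point)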

Implementation

The implementation mainly relies on Keras' functional API (Model), which makes it fairly easy to define custom layers. The dataset is the Forest CoverType data mentioned in the paper, and the number of layers and neurons per layer are hard-coded following the paper. One difference: the paper does not appear to embed the categorical features, whereas this dataset has a categorical feature with cardinality 40, so the code embeds that variable, with the embedding dimension set by the formula given in the paper. Few other hyperparameters remain; they are mostly just the weight initialization schemes. The code follows below; it also saves the generated network structure to model.png via plot_model.
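
For reference, this is the embedding-size rule from the paper applied to the single categorical column used here (Soil, cardinality 40); embed_dim is just an illustrative variable name:

import numpy as np

cardinality = 40                                    # distinct values of the Soil column
embed_dim = int(6 * np.power(cardinality, 1 / 4))   # 6 * 40**0.25 ~= 15.09 -> 15
embed_dim = min(embed_dim, cardinality)             # never wider than the cardinality itself
print(embed_dim)                                    # 15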

import numpy as np
import pandas as pd
import keras.backend as K
from keras import layers
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, Embedding, Reshape, Add
from keras.layers import Flatten, merge, Lambda
from keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import random
# similar to https://github.com/jrzaurin/Wide-and-Deep-Keras/blob/master/wide_and_deep_keras.py
def feature_generate(data):
    data, label, cate_columns, cont_columns = preprocessing(data)
    embeddings_tensors = []
    continuous_tensors = []
    for ec in cate_columns:
        layer_name = ec + '_inp'
        # For categorical features, we embed the features in dense vectors of dimension 6 * (category cardinality)**(1/4)
        embed_dim = data[ec].nunique() if int(6 * np.power(data[ec].nunique(), 1/4)) > data[ec].nunique() \
            else int(6 * np.power(data[ec].nunique(), 1/4))
        t_inp, t_build = embedding_input(layer_name, data[ec].nunique(), embed_dim)
        embeddings_tensors.append((t_inp, t_build))
        del(t_inp, t_build)
    for cc in cont_columns:
        layer_name = cc + '_in'
        t_inp, t_build = continous_input(layer_name)
        continuous_tensors.append((t_inp, t_build))
        del(t_inp, t_build)
    inp_layer = [et[0] for et in embeddings_tensors]
    inp_layer += [ct[0] for ct in continuous_tensors]
    inp_embed = [et[1] for et in embeddings_tensors]
    inp_embed += [ct[1] for ct in continuous_tensors]
    return data, label, inp_layer, inp_embed

def embedding_input(name, n_in, n_out):
    inp = Input(shape = (1, ), dtype = 'int64', name = name)
    return inp, Embedding(n_in, n_out, input_length = 1)(inp)

def continous_input(name):
    inp = Input(shape = (1, ), dtype = 'float32', name = name)
    return inp, Reshape((1, 1))(inp)

# The optimal hyperparameter settings were 8 cross layers of size 54 and 6 deep layers of size 292 for DCN
# After embedding the "Soil_Type" column (embedding dim == 15), we have 8 cross layers of size 29
def fit(inp_layer, inp_embed, X, y):
    #inp_layer, inp_embed = feature_generate(X, cate_columns, cont_columns)
    # note: merge(mode = 'concat') is the Keras 1.x functional API (Concatenate in Keras 2)
    input = merge(inp_embed, mode = 'concat')
    # deep layer
    for i in range(6):
        if i == 0:
            deep = Dense(272, activation='relu')(Flatten()(input))
        else:
            deep = Dense(272, activation='relu')(deep)
    # cross layer
    cross = CrossLayer(output_dim = input.shape[2].value, num_layer = 8, name = "cross_layer")(input)
    # concat both layers
    output = merge([deep, cross], mode = 'concat')
    output = Dense(y.shape[1], activation = 'softmax')(output)
    model = Model(inp_layer, output)
    print(model.summary())
    plot_model(model, to_file = 'model.png', show_shapes = True)
    model.compile(Adam(0.01), loss = 'categorical_crossentropy', metrics = ["accuracy"])
    model.fit([X[c] for c in X.columns], y, batch_size = 256, epochs = 10)
    return model

def evaluate(X, y, model):
    y_pred = model.predict([X[c] for c in X.columns])
    acc = np.sum(np.argmax(y_pred, 1) == np.argmax(y, 1)) / y.shape[0]
    print("Accuracy: ", acc)

# https://keras.io/layers/writing-your-own-keras-layers/
class CrossLayer(layers.Layer):
    def __init__(self, output_dim, num_layer, **kwargs):
        self.output_dim = output_dim
        self.num_layer = num_layer
        super(CrossLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_dim = input_shape[2]
        self.W = []
        self.bias = []
        for i in range(self.num_layer):
            self.W.append(self.add_weight(shape = [1, self.input_dim], initializer = 'glorot_uniform', name = 'w_' + str(i), trainable = True))
            self.bias.append(self.add_weight(shape = [1, self.input_dim], initializer = 'zeros', name = 'b_' + str(i), trainable = True))
        self.built = True

    def call(self, input):
        for i in range(self.num_layer):
            if i == 0:
                cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), x), 1, keepdims = True), self.bias[i], x]))(input)
            else:
                cross = Lambda(lambda x: Add()([K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), input), 1, keepdims = True), self.bias[i], input]))(cross)
        return Flatten()(cross)

    def compute_output_shape(self, input_shape):
        return (None, self.output_dim)

# modify the embedding columns here
def preprocessing(data):
    # inverse transform one-hot to continuous column
    df_onehot = data[[col for col in data.columns.tolist() if "Soil_Type" in col]]
    #for i in df_onehot.columns.tolist():
    #    if df_onehot[i].sum() == 0:
    #        del df_onehot[i]
    data["Soil"] = df_onehot.dot(np.array(range(df_onehot.columns.size))).astype(int)
    data.drop([col for col in data.columns.tolist() if "Soil_Type" in col], axis = 1, inplace = True)
    label = np.array(OneHotEncoder().fit_transform(data["Cover_Type"].values.reshape(-1, 1)).todense())
    del data["Cover_Type"]
    cate_columns = ["Soil"]
    cont_columns = [col for col in data.columns if col != "Soil"]
    # Feature normalization
    scaler = StandardScaler()
    data_cont = pd.DataFrame(scaler.fit_transform(data[cont_columns]), columns = cont_columns)
    data_cate = data[cate_columns]
    data = pd.concat([data_cate, data_cont], axis = 1)
    return data, label, cate_columns, cont_columns

if __name__ == "__main__":
    # data downloaded from https://www.kaggle.com/uciml/forest-cover-type-dataset/data
    data = pd.read_csv("./data/covtype.csv")
    X, y, inp_layer, inp_embed = feature_generate(data)
    # random split into train and test with a 9:1 ratio
    train_index = random.sample(range(X.shape[0]), int(X.shape[0] * 0.9))
    test_index = list(set(range(X.shape[0])) - set(train_index))
    model = fit(inp_layer, inp_embed, X.iloc[train_index], y[train_index, :])
    evaluate(X.iloc[test_index], y[test_index, :], model)