diff --git a/deepmatch/models/dat.py b/deepmatch/models/dat.py
new file mode 100644
index 0000000..290caea
--- /dev/null
+++ b/deepmatch/models/dat.py
@@ -0,0 +1,94 @@
+"""
+Author:
+    Yang Bo, 469828263@qq.com
+Reference:
+    Yantao Yu, Weipeng Wang, Zhoutian Feng, Daiyue Xue, et al. A Dual Augmented Two-tower Model for Online Large-scale Recommendation. DLP-KDD 2021.
+"""
+
+from deepctr.feature_column import build_input_features, create_embedding_matrix
+from deepctr.layers import PredictionLayer, DNN, combined_dnn_input
+from deepctr.layers.utils import Hash
+from tensorflow.python.keras.models import Model
+from tensorflow.keras.layers import Input, Embedding, Flatten
+from tensorflow.keras.regularizers import l2
+from tensorflow.keras import backend as K
+
+from ..inputs import input_from_feature_columns
+from ..layers.core import Similarity
+
+
+def generate_augmented_embedding(feat, l2_reg_embedding=1e-6):
+    """Build a separate input and embedding table for the augmented vector of `feat`."""
+    inp = Input(shape=(1,), name='aug_inp_' + feat.name, dtype=feat.dtype)
+    if feat.use_hash:
+        lookup_idx = Hash(feat.vocabulary_size, mask_zero=False, vocabulary_path=feat.vocabulary_path)(inp)
+    else:
+        lookup_idx = inp
+    emb = Embedding(feat.vocabulary_size, feat.embedding_dim,
+                    embeddings_initializer=feat.embeddings_initializer,
+                    embeddings_regularizer=l2(l2_reg_embedding),
+                    name='aug_emb_' + feat.embedding_name)
+    emb.trainable = feat.trainable
+    return inp, Flatten()(emb(lookup_idx))
+
+
+def DAT(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32),
+        item_dnn_hidden_units=(64, 32),
+        dnn_activation='tanh', dnn_use_bn=False,
+        l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024, metric='cos'):
+    """Instantiates the Dual Augmented Two-tower (DAT) model architecture.
+
+    :param user_feature_columns: An iterable containing the user's features used by the model.
+    :param item_feature_columns: An iterable containing the item's features used by the model.
+    :param user_dnn_hidden_units: list of positive integers, or empty list, the layer number and units in each layer of the user tower.
+    :param item_dnn_hidden_units: list of positive integers, or empty list, the layer number and units in each layer of the item tower.
+    :param dnn_activation: Activation function to use in the deep net.
+    :param dnn_use_bn: bool. Whether to use BatchNormalization before activation in the deep net.
+    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN.
+    :param l2_reg_embedding: float. L2 regularizer strength applied to the embedding vectors.
+    :param dnn_dropout: float in [0,1), the probability of dropping a given DNN coordinate.
+    :param seed: integer, to use as random seed.
+    :param metric: str, ``"cos"`` for cosine or ``"ip"`` for inner product.
+    :return: A tuple ``(model, output, user_dnn_out, item_dnn_out, a_u_l, a_v_l)``; the last two entries are stop-gradient copies of the augmented embeddings, to be used in the mimic loss.
+    """
+
+    embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding,
+                                                    seed=seed,
+                                                    seq_mask_zero=True)
+
+    user_features = build_input_features(user_feature_columns)
+    user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
+                                                                                   user_feature_columns,
+                                                                                   l2_reg_embedding, seed=seed,
+                                                                                   embedding_matrix_dict=embedding_matrix_dict)
+    # The augmented vector is keyed by the first user feature (e.g. user_id) and
+    # fed to the user tower as an extra dense input (dense features are not forwarded here).
+    i_u, a_u = generate_augmented_embedding(user_feature_columns[0], l2_reg_embedding)
+    user_inputs_list = list(user_features.values()) + [i_u]
+    user_dnn_input = combined_dnn_input(user_sparse_embedding_list, [a_u])
+
+    item_features = build_input_features(item_feature_columns)
+    item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features,
+                                                                                   item_feature_columns,
+                                                                                   l2_reg_embedding, seed=seed,
+                                                                                   embedding_matrix_dict=embedding_matrix_dict)
+    i_v, a_v = generate_augmented_embedding(item_feature_columns[0], l2_reg_embedding)
+    item_inputs_list = list(item_features.values()) + [i_v]
+    item_dnn_input = combined_dnn_input(item_sparse_embedding_list, [a_v])
+
+    user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
+                       dnn_use_bn, seed=seed)(user_dnn_input)
+
+    item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
+                       dnn_use_bn, seed=seed)(item_dnn_input)
+
+    score = Similarity(type=metric)([user_dnn_out, item_dnn_out])
+
+    output = PredictionLayer("binary", False)(score)
+
+    model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)
+
+    model.__setattr__("user_input", user_inputs_list)
+    model.__setattr__("item_input", item_inputs_list)
+    model.__setattr__("user_embedding", user_dnn_out)
+    model.__setattr__("item_embedding", item_dnn_out)
+
+    # Freeze the augmented vectors for the mimic loss: gradients from the mimic
+    # terms then flow only into the opposite tower, not into a_u / a_v themselves.
+    a_u_l = K.stop_gradient(a_u)
+    a_v_l = K.stop_gradient(a_v)
+    return model, output, user_dnn_out, item_dnn_out, a_u_l, a_v_l
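+
+# Minimal usage sketch (assumed wiring; see examples/run_dat.py for the full
+# training script, including a `dual_augmented_loss` implementation):
+#
+#   model, y_pred, p_u, p_v, a_u, a_v = DAT(user_feature_columns, item_feature_columns)
+#   model.compile(optimizer='adagrad', loss=dual_augmented_loss(p_u, p_v, a_u, a_v))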
+ + """ + + embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, + seed=seed, + seq_mask_zero=True) + + user_features = build_input_features(user_feature_columns) + user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features, + user_feature_columns, + l2_reg_embedding, seed=seed, + embedding_matrix_dict=embedding_matrix_dict) + i_u, a_u = generate_augmented_embedding(user_feature_columns[0]) + user_inputs_list = list(user_features.values()) + [i_u] + user_dnn_input = combined_dnn_input(user_sparse_embedding_list, [a_u]) + + item_features = build_input_features(item_feature_columns) + item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features, + item_feature_columns, + l2_reg_embedding, seed=seed, + embedding_matrix_dict=embedding_matrix_dict) + i_v, a_v = generate_augmented_embedding(item_feature_columns[0]) + item_inputs_list = list(item_features.values()) + [i_v] + item_dnn_input = combined_dnn_input(item_sparse_embedding_list, [a_v]) + + user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, + dnn_use_bn, seed=seed)(user_dnn_input) + + item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, + dnn_use_bn, seed=seed)(item_dnn_input) + + score = Similarity(type=metric)([user_dnn_out, item_dnn_out]) + + output = PredictionLayer("binary", False)(score) + + model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) + + model.__setattr__("user_input", user_inputs_list) + model.__setattr__("item_input", item_inputs_list) + model.__setattr__("user_embedding", user_dnn_out) + model.__setattr__("item_embedding", item_dnn_out) + + a_u_l = K.stop_gradient(a_u) + a_v_l = K.stop_gradient(a_v) + return model, output, user_dnn_out, item_dnn_out, a_u_l, a_v_l diff --git a/deepmatch/models/deepfm.py b/deepmatch/models/deepfm.py new file mode 100644 index 0000000..e75ef6a --- /dev/null +++ b/deepmatch/models/deepfm.py @@ -0,0 +1,84 @@ +""" +Author: + Yang Bo, 469828263@qq.com +Reference: + Guo H, Tang R, Ye Y, et al. Deepfm: a factorization-machine based neural network for ctr prediction[J]. arXiv preprint arXiv:1703.04247, 2017.(https://arxiv.org/abs/1703.04247) +""" + +from deepctr.feature_column import build_input_features, create_embedding_matrix +from deepctr.layers import PredictionLayer, DNN, combined_dnn_input +from tensorflow.python.keras.models import Model + +from ..inputs import input_from_feature_columns +from ..layers.core import Similarity +import tensorflow as tf + + +def DeepFM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32), + item_dnn_hidden_units=(64, 32), + dnn_activation='tanh', dnn_use_bn=False, + l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024, metric='cos'): + """Instantiates the Deep Structured Semantic Model architecture. + + :param user_feature_columns: An iterable containing user's features used by the model. + :param item_feature_columns: An iterable containing item's features used by the model. + :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower + :param item_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of item tower + :param dnn_activation: Activation function to use in deep net + :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net + :param l2_reg_dnn: float. 
diff --git a/examples/run_dat.py b/examples/run_dat.py
new file mode 100644
index 0000000..81967fc
--- /dev/null
+++ b/examples/run_dat.py
@@ -0,0 +1,123 @@
+import pandas as pd
+import tensorflow as tf
+import tensorflow.keras.backend as K
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat
+from preprocess import gen_data_set, gen_model_input
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.framework.ops import disable_eager_execution
+from tensorflow.python.keras.models import Model
+
+from deepmatch.models import *
+
+disable_eager_execution()  # needed for the custom loss below
+
+
+def dual_augmented_loss(p_u, p_v, a_u, a_v):
+    # p_u / p_v: user / item tower outputs; a_u / a_v: stop-gradient copies of
+    # the augmented embeddings, as returned by DAT.
+    def loss(y_true, y_pred):
+        y_ = K.cast(y_true, tf.float32)
+        # Main matching loss on the predicted score.
+        loss_p = K.mean(K.square(y_ - y_pred))
+        # Mimic losses: y_ * a_u + (1 - y_) * p_v - p_v == y_ * (a_u - p_v), so
+        # for positive pairs each tower's output is pulled towards the frozen
+        # augmented vector of the other side; for negatives the terms vanish.
+        loss_u = K.mean(K.square(y_ * a_u + (1 - y_) * p_v - p_v))
+        loss_v = K.mean(K.square(y_ * a_v + (1 - y_) * p_u - p_u))
+        return loss_p + 0.5 * loss_u + 0.5 * loss_v
+
+    return loss
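+
+# In formula form (y in {0, 1}; stop-gradient already applied to a_u, a_v):
+#   L = mean((y - y_hat)^2)
+#       + 0.5 * mean(y * (a_u - p_v)^2) + 0.5 * mean(y * (a_v - p_u)^2)
+# The mimic terms require the augmented embedding dim to equal the towers'
+# output dim; in the script below both are 8 (embedding_dim = 8,
+# dnn_hidden_units = (32, 16, 8)).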
+
+if __name__ == "__main__":
+
+    data = pd.read_csv("./movielens_sample.txt")
+    sparse_features = ["movie_id", "user_id",
+                       "gender", "age", "occupation", "zip", ]
+    SEQ_LEN = 50
+    negsample = 3
+
+    # 1. Label-encode the sparse features, and process sequence features with
+    # `gen_data_set` and `gen_model_input`
+
+    features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
+    feature_max_idx = {}
+    for feature in features:
+        lbe = LabelEncoder()
+        data[feature] = lbe.fit_transform(data[feature]) + 1
+        feature_max_idx[feature] = data[feature].max() + 1
+
+    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
+
+    item_profile = data[["movie_id"]].drop_duplicates('movie_id')
+
+    user_profile.set_index("user_id", inplace=True)
+
+    user_item_list = data.groupby("user_id")['movie_id'].apply(list)
+
+    train_set, test_set = gen_data_set(data, negsample)
+
+    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
+    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
+    # The augmented-embedding inputs reuse the raw id features.
+    train_model_input['aug_inp_user_id'] = train_model_input['user_id']
+    train_model_input['aug_inp_movie_id'] = train_model_input['movie_id']
+    test_model_input['aug_inp_user_id'] = test_model_input['user_id']
+    test_model_input['aug_inp_movie_id'] = test_model_input['movie_id']
+
+    # 2. Count the unique values of each sparse field and generate the feature
+    # config for the sequence feature
+
+    embedding_dim = 8
+
+    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
+                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
+                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
+                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
+                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
+                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
+                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
+                            ]
+
+    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
+
+    # 3. Define the model and train it
+
+    user_dnn_hidden_units = (32, 16, 8)
+    item_dnn_hidden_units = (32, 16, 8)
+    model, y_pred, p_u, p_v, a_u, a_v = DAT(user_feature_columns, item_feature_columns,
+                                            user_dnn_hidden_units=user_dnn_hidden_units,
+                                            item_dnn_hidden_units=item_dnn_hidden_units)
+
+    model.compile(optimizer='adagrad', loss=dual_augmented_loss(p_u, p_v, a_u, a_v))
+
+    history = model.fit(train_model_input, train_label,
+                        batch_size=256, epochs=1, verbose=1, validation_split=0.0, )
+
+    # 4. Generate user features for testing and full item features for retrieval
+    test_user_model_input = test_model_input
+    all_item_model_input = {"movie_id": item_profile['movie_id'].values,
+                            "aug_inp_movie_id": item_profile['movie_id'].values}
+
+    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
+    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
+
+    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
+    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
+
+    print(user_embs.shape)
+    print(item_embs.shape)
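+
+    # Dependency-free brute-force alternative to the faiss block below (a
+    # sketch; exact top-k by inner product, fine at this sample's scale):
+    #
+    #   import numpy as np
+    #   scores = user_embs @ item_embs.T              # (n_users, n_items)
+    #   top50 = np.argsort(-scores, axis=1)[:, :50]   # indices into item_profile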
+
+    # 5. [Optional] ANN search by faiss and evaluate the result
+
+    # test_true_label = {line[0]:[line[2]] for line in test_set}
+    #
+    # import numpy as np
+    # import faiss
+    # from tqdm import tqdm
+    # from deepmatch.utils import recall_N
+    #
+    # index = faiss.IndexFlatIP(user_dnn_hidden_units[-1])
+    # # faiss.normalize_L2(item_embs)
+    # index.add(item_embs)
+    # # faiss.normalize_L2(user_embs)
+    # D, I = index.search(user_embs, 50)
+    # s = []
+    # hit = 0
+    # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
+    #     try:
+    #         pred = [item_profile['movie_id'].values[x] for x in I[i]]
+    #         filter_item = None
+    #         recall_score = recall_N(test_true_label[uid], pred, N=50)
+    #         s.append(recall_score)
+    #         if test_true_label[uid] in pred:
+    #             hit += 1
+    #     except:
+    #         print(i)
+    # print("recall", np.mean(s))
+    # print("hr", hit / len(test_user_model_input['user_id']))
diff --git a/examples/run_deepfm_negsampling.py b/examples/run_deepfm_negsampling.py
new file mode 100644
index 0000000..7d00819
--- /dev/null
+++ b/examples/run_deepfm_negsampling.py
@@ -0,0 +1,105 @@
+import pandas as pd
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat
+from preprocess import gen_data_set, gen_model_input
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.keras.models import Model
+
+from deepmatch.models import *
+
+if __name__ == "__main__":
+
+    data = pd.read_csv("./movielens_sample.txt")
+    sparse_features = ["movie_id", "user_id",
+                       "gender", "age", "occupation", "zip", ]
+    SEQ_LEN = 50
+    negsample = 3
+
+    # 1. Label-encode the sparse features, and process sequence features with
+    # `gen_data_set` and `gen_model_input`
+
+    features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
+    feature_max_idx = {}
+    for feature in features:
+        lbe = LabelEncoder()
+        data[feature] = lbe.fit_transform(data[feature]) + 1
+        feature_max_idx[feature] = data[feature].max() + 1
+
+    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
+
+    item_profile = data[["movie_id"]].drop_duplicates('movie_id')
+
+    user_profile.set_index("user_id", inplace=True)
+
+    user_item_list = data.groupby("user_id")['movie_id'].apply(list)
+
+    train_set, test_set = gen_data_set(data, negsample)
+
+    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
+    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
+
+    # 2. Count the unique values of each sparse field and generate the feature
+    # config for the sequence feature
+
+    embedding_dim = 8
+
+    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
+                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
+                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
+                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
+                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
+                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
+                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
+                            ]
+
+    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
+
+    # 3. Define the model and train it
+
+    user_dnn_hidden_units = (32, 16, 8)
+    item_dnn_hidden_units = (32, 16, 8)
+    model = DeepFM(user_feature_columns, item_feature_columns,
+                   user_dnn_hidden_units=user_dnn_hidden_units,
+                   item_dnn_hidden_units=item_dnn_hidden_units)
+
+    model.compile(optimizer='adagrad', loss="binary_crossentropy")
+
+    history = model.fit(train_model_input, train_label,
+                        batch_size=256, epochs=1, verbose=1, validation_split=0.0, )
+
+    # 4. Generate user features for testing and full item features for retrieval
+    test_user_model_input = test_model_input
+    all_item_model_input = {"movie_id": item_profile['movie_id'].values}
+
+    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
+    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
+
+    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
+    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
+
+    print(user_embs.shape)
+    print(item_embs.shape)
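+
+    # Note: the retrieval embeddings concatenate the DNN output
+    # (user_dnn_hidden_units[-1] = 8 dims) with the sum-pooled FM vector
+    # (embedding_dim = 8 dims), so the ANN index below needs dimension 8 + 8 = 16.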
+
+    # 5. [Optional] ANN search by faiss and evaluate the result
+
+    # test_true_label = {line[0]:[line[2]] for line in test_set}
+    #
+    # import numpy as np
+    # import faiss
+    # from tqdm import tqdm
+    # from deepmatch.utils import recall_N
+    #
+    # index = faiss.IndexFlatIP(user_dnn_hidden_units[-1] + embedding_dim)
+    # # faiss.normalize_L2(item_embs)
+    # index.add(item_embs)
+    # # faiss.normalize_L2(user_embs)
+    # D, I = index.search(user_embs, 50)
+    # s = []
+    # hit = 0
+    # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
+    #     try:
+    #         pred = [item_profile['movie_id'].values[x] for x in I[i]]
+    #         filter_item = None
+    #         recall_score = recall_N(test_true_label[uid], pred, N=50)
+    #         s.append(recall_score)
+    #         if test_true_label[uid] in pred:
+    #             hit += 1
+    #     except:
+    #         print(i)
+    # print("recall", np.mean(s))
+    # print("hr", hit / len(test_user_model_input['user_id']))
diff --git a/examples/run_dssm_negsampling.py b/examples/run_dssm_negsampling.py
index 2ed6d83..2cd9a4f 100644
--- a/examples/run_dssm_negsampling.py
+++ b/examples/run_dssm_negsampling.py
@@ -53,7 +53,9 @@
 
     # 3.Define Model and train
 
-    model = DSSM(user_feature_columns, item_feature_columns)  # FM(user_feature_columns,item_feature_columns)
+    user_dnn_hidden_units = (32, 16, 8)
+    item_dnn_hidden_units = (32, 16, 8)
+    model = DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=user_dnn_hidden_units, item_dnn_hidden_units=item_dnn_hidden_units)
 
     model.compile(optimizer='adagrad', loss="binary_crossentropy")
 
@@ -76,13 +78,13 @@
     # 5. [Optional] ANN search by faiss and evaluate the result
 
     # test_true_label = {line[0]:[line[2]] for line in test_set}
-    #
+
     # import numpy as np
     # import faiss
    # from tqdm import tqdm
     # from deepmatch.utils import recall_N
-    #
-    # index = faiss.IndexFlatIP(embedding_dim)
+
+    # index = faiss.IndexFlatIP(user_dnn_hidden_units[-1])
     # # faiss.normalize_L2(item_embs)
     # index.add(item_embs)
     # # faiss.normalize_L2(user_embs)