{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Libs" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import random\n", "import datetime\n", "import numpy as np\n", "\n", "import scipy.sparse as sp\n", "import pandas as pd\n", "\n", "from itertools import islice, cycle\n", "from more_itertools import pairwise\n", "from implicit.nearest_neighbours import TFIDFRecommender\n", "\n", "%matplotlib inline\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "mpl.rc('axes', labelsize=14)\n", "mpl.rc('xtick', labelsize=12)\n", "mpl.rc('ytick', labelsize=12)\n", "\n", "import seaborn as sns\n", "sns.set(style='whitegrid')\n", "sns.set(rc={'figure.figsize':(17, 9)})\n", "\n", "from IPython.core.display import display, HTML, clear_output\n", "display(HTML(''))\n", "display(HTML(''))\n", "display(HTML(''))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def calculate_novelty(train_interactions, recommendations, top_n): \n", " users = recommendations['user_id'].unique()\n", " n_users = train_interactions['user_id'].nunique()\n", " n_users_per_item = train_interactions.groupby('item_id')['user_id'].nunique()\n", "\n", " recommendations = recommendations.loc[recommendations['rank'] <= top_n].copy()\n", " recommendations['n_users_per_item'] = recommendations['item_id'].map(n_users_per_item)\n", " recommendations['n_users_per_item'] = recommendations['n_users_per_item'].fillna(1)\n", " recommendations['item_novelty'] = -np.log2(recommendations['n_users_per_item'] 
/ n_users)\n", "\n", " item_novelties = recommendations[['user_id', 'rank', 'item_novelty']]\n", " \n", " miuf_at_k = item_novelties.loc[item_novelties['rank'] <= top_n, ['user_id', 'item_novelty']]\n", " miuf_at_k = miuf_at_k.groupby('user_id').agg('mean').squeeze()\n", "\n", " return miuf_at_k.reindex(users).mean()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def compute_metrics(train, test, recs, top_N):\n", "    \"\"\"Compute MAP@top_N and Novelty@top_N for one train/test split.\n", "\n", "    Parameters\n", "    ----------\n", "    train : pd.DataFrame\n", "        Training interactions; passed unchanged to calculate_novelty.\n", "    test : pd.DataFrame\n", "        Held-out interactions with 'user_id' and 'item_id' columns.\n", "    recs : pd.DataFrame\n", "        Recommendations with 'user_id', 'item_id' and 'rank' columns\n", "        (the MAP formula below treats rank 1 as the top position).\n", "    top_N : int\n", "        Cut-off: only recommendations with rank <= top_N are scored.\n", "\n", "    Returns\n", "    -------\n", "    pd.Series\n", "        Two values, keyed 'MAP@<top_N>' and 'Novelty@<top_N>'.\n", "    \"\"\"\n", "    # Score only the first top_N positions; without this cut a longer\n", "    # recommendation list would add hits ranked below the cut-off to MAP@top_N.\n", "    recs_at_k = recs.loc[recs['rank'] <= top_N]\n", "\n", "    test_recs = test.set_index(['user_id', 'item_id']).join(recs_at_k.set_index(['user_id', 'item_id']))\n", "    # Within each user: hits in ascending rank first, misses (rank is NaN) last.\n", "    test_recs = test_recs.sort_values(by=['user_id', 'rank'])\n", "\n", "    # Number of test rows per user, hits and misses alike.\n", "    test_recs['users_item_count'] = test_recs.groupby(level='user_id')['rank'].transform('size')\n", "    # Ordinal of the hit within the user divided by its rank, i.e. the\n", "    # precision at that rank; stays NaN for misses and is skipped by sum().\n", "    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1\n", "    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']\n", "\n", "    users_count = test_recs.index.get_level_values('user_id').nunique()\n", "\n", "    result = {}\n", "\n", "    # Uncomment for Precision/Recall at k results\n", "    # for k in range(1, top_N + 1):\n", "    #     hit_k = f'hit@{k}'\n", "    #     test_recs[hit_k] = test_recs['rank'] <= k\n", "    #     result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count\n", "    #     result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count\n", "\n", "    result[f'MAP@{top_N}'] = (test_recs['cumulative_rank'] / test_recs['users_item_count']).sum() / users_count\n", "    result[f'Novelty@{top_N}'] = calculate_novelty(train, recs, top_N)\n", "\n", "    return pd.Series(result)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "class TimeRangeSplit():\n", " \"\"\"\n", " https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html\n", " \"\"\"\n", " def __init__(self, \n", " start_date, \n", " end_date=None, \n", " freq='D', \n", " 
periods=None, \n", " tz=None, \n", " normalize=False, \n", " closed=None, \n", " train_min_date=None,\n", " filter_cold_users=True, \n", " filter_cold_items=True, \n", " filter_already_seen=True):\n", " \n", " self.start_date = start_date\n", " if end_date is None and periods is None:\n", " raise ValueError('Either \"end_date\" or \"periods\" must be non-zero, not both at the same time.')\n", "\n", " self.end_date = end_date\n", " self.freq = freq\n", " self.periods = periods\n", " self.tz = tz\n", " self.normalize = normalize\n", " self.closed = closed\n", " self.train_min_date = pd.to_datetime(train_min_date, errors='raise')\n", " self.filter_cold_users = filter_cold_users\n", " self.filter_cold_items = filter_cold_items\n", " self.filter_already_seen = filter_already_seen\n", "\n", " self.date_range = pd.date_range(\n", " start=start_date, \n", " end=end_date, \n", " freq=freq, \n", " periods=periods, \n", " tz=tz, \n", " normalize=normalize, \n", " closed=closed)\n", "\n", " self.max_n_splits = max(0, len(self.date_range) - 1)\n", " if self.max_n_splits == 0:\n", " raise ValueError('Provided parametrs set an empty date range.') \n", "\n", " def split(self, \n", " df, \n", " user_column='user_id',\n", " item_column='item_id',\n", " datetime_column='date',\n", " fold_stats=False):\n", " df_datetime = df[datetime_column]\n", " if self.train_min_date is not None:\n", " train_min_mask = df_datetime >= self.train_min_date\n", " else:\n", " train_min_mask = df_datetime.notnull()\n", "\n", " date_range = self.date_range[(self.date_range >= df_datetime.min()) & \n", " (self.date_range <= df_datetime.max())]\n", "\n", " for start, end in pairwise(date_range):\n", " fold_info = {\n", " 'Start date': start,\n", " 'End date': end\n", " }\n", " train_mask = train_min_mask & (df_datetime < start)\n", " train_idx = df.index[train_mask]\n", " if fold_stats:\n", " fold_info['Train'] = len(train_idx)\n", "\n", " test_mask = (df_datetime >= start) & (df_datetime < end)\n", " 
test_idx = df.index[test_mask]\n", " \n", " if self.filter_cold_users:\n", " new = np.setdiff1d(\n", " df.loc[test_idx, user_column].unique(), \n", " df.loc[train_idx, user_column].unique())\n", " new_idx = df.index[test_mask & df[user_column].isin(new)]\n", " test_idx = np.setdiff1d(test_idx, new_idx)\n", " test_mask = df.index.isin(test_idx)\n", " if fold_stats:\n", " fold_info['New users'] = len(new)\n", " fold_info['New users interactions'] = len(new_idx)\n", "\n", " if self.filter_cold_items:\n", " new = np.setdiff1d(\n", " df.loc[test_idx, item_column].unique(), \n", " df.loc[train_idx, item_column].unique())\n", " new_idx = df.index[test_mask & df[item_column].isin(new)]\n", " test_idx = np.setdiff1d(test_idx, new_idx)\n", " test_mask = df.index.isin(test_idx)\n", " if fold_stats:\n", " fold_info['New items'] = len(new)\n", " fold_info['New items interactions'] = len(new_idx)\n", "\n", " if self.filter_already_seen:\n", " user_item = [user_column, item_column]\n", " train_pairs = df.loc[train_idx, user_item].set_index(user_item).index\n", " test_pairs = df.loc[test_idx, user_item].set_index(user_item).index\n", " intersection = train_pairs.intersection(test_pairs)\n", " print(f'Already seen number: {len(intersection)}')\n", " test_idx = test_idx[~test_pairs.isin(intersection)]\n", " # test_mask = rd.df.index.isin(test_idx)\n", " if fold_stats:\n", " fold_info['Known interactions'] = len(intersection)\n", "\n", " if fold_stats:\n", " fold_info['Test'] = len(test_idx)\n", "\n", " yield (train_idx, test_idx, fold_info)\n", "\n", " def get_n_splits(self, df, datetime_column='date'):\n", " df_datetime = df[datetime_column]\n", " if self.train_min_date is not None:\n", " df_datetime = df_datetime[df_datetime >= self.train_min_date]\n", "\n", " date_range = self.date_range[(self.date_range >= df_datetime.min()) & \n", " (self.date_range <= df_datetime.max())]\n", "\n", " return max(0, len(date_range) - 1)" ] }, { "cell_type": "code", "execution_count": 5, 
"metadata": {}, "outputs": [], "source": [ "def get_coo_matrix(df,\n", "                   user_col='user_id',\n", "                   item_col='item_id',\n", "                   weight_col=None,\n", "                   users_mapping=None,\n", "                   items_mapping=None):\n", "    \"\"\"Build a sparse user-item interaction matrix in COO format.\n", "\n", "    Parameters\n", "    ----------\n", "    df : pd.DataFrame\n", "        Interactions, one row per (user, item) pair.\n", "    user_col, item_col : str\n", "        Names of the columns holding the external user and item ids.\n", "    weight_col : str, optional\n", "        Column with interaction weights; every interaction weighs 1 when omitted.\n", "    users_mapping, items_mapping : dict\n", "        External id -> non-negative integer row / column index.\n", "\n", "    Returns\n", "    -------\n", "    scipy.sparse.coo_matrix\n", "        float32 matrix whose shape is fixed by the mappings\n", "        (largest index + 1 on each axis), not by the ids present in df.\n", "\n", "    Raises\n", "    ------\n", "    ValueError\n", "        If a mapping is missing or empty, or df holds an id absent from it.\n", "    \"\"\"\n", "    if not users_mapping or not items_mapping:\n", "        raise ValueError('users_mapping and items_mapping must both be non-empty')\n", "\n", "    if weight_col is None:\n", "        weights = np.ones(len(df), dtype=np.float32)\n", "    else:\n", "        weights = df[weight_col].astype(np.float32).values\n", "\n", "    rows = df[user_col].map(users_mapping)\n", "    cols = df[item_col].map(items_mapping)\n", "    if rows.isna().any() or cols.isna().any():\n", "        raise ValueError('df contains user or item ids that are missing from the mappings')\n", "\n", "    # Pin the shape to the mappings: when scipy infers it from the data, users\n", "    # or items absent from df shrink the matrix and looking them up later fails.\n", "    shape = (max(users_mapping.values()) + 1, max(items_mapping.values()) + 1)\n", "    return sp.coo_matrix(\n", "        (weights, (rows.astype(np.int64).values, cols.astype(np.int64).values)),\n", "        shape=shape,\n", "    )" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def generate_implicit_recs_mapper(\n", " model,\n", " train_matrix,\n", " top_N,\n", " user_mapping,\n", " item_inv_mapping,\n", " filter_already_liked_items\n", "):\n", " def _recs_mapper(user):\n", " user_id = user_mapping[user]\n", " recs = model.recommend(user_id, \n", " train_matrix, \n", " N=top_N, \n", " filter_already_liked_items=filter_already_liked_items)\n", " return [item_inv_mapping[item] for item, _ in recs]\n", " return _recs_mapper" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "users_df = pd.read_csv('users_processed.csv',)\n", "items_df = pd.read_csv('items_processed.csv',)\n", "interactions_df = pd.read_csv('interactions_processed.csv', parse_dates=['last_watch_dt'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Baseline - популярное" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "class PopularRecommender():\n", " def __init__(self, max_K=10, days=30, item_column='item_id', dt_column='date'):\n", " self.max_K = max_K\n", " self.days = days\n", " self.item_column = item_column\n", " self.dt_column = dt_column\n", " self.recommendations = []\n", " \n", " def fit(self, df, ):\n", " min_date = 
df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)\n", " self.recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values\n", " \n", " def recommend(self, users=None, N=10):\n", " recs = self.recommendations[:N]\n", " if users is None:\n", " return recs\n", " else:\n", " return list(islice(cycle([recs]), len(users)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Пример на одном фолде" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "test = interactions_df[interactions_df['last_watch_dt'] == interactions_df['last_watch_dt'].max()]\n", "train = interactions_df[interactions_df['last_watch_dt'] < interactions_df['last_watch_dt'].max()]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "pop_model = PopularRecommender(days=7, dt_column='last_watch_dt')\n", "pop_model.fit(train)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 9728, 15297, 10440, 13865, 12360, 14488, 12192, 341, 512,\n", " 4151])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top10_recs = pop_model.recommend()\n", "top10_recs" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "item_titles = pd.Series(items_df['title'].values, index=items_df['item_id']).to_dict()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['гнев человеческий',\n", " 'клиника счастья',\n", " 'хрустальный',\n", " 'девятаев',\n", " 'круэлла',\n", " 'мастер меча',\n", " 'фемида видит',\n", " 'лето - это море',\n", " 'рядовой чээрин',\n", " 'секреты семейной жизни']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(map(item_titles.get, top10_recs))" ] }, { "cell_type": "code", 
"execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_id
0203219[9728, 15297, 10440, 13865, 12360, 14488, 1219...
1125519[9728, 15297, 10440, 13865, 12360, 14488, 1219...
2626036[9728, 15297, 10440, 13865, 12360, 14488, 1219...
31029980[9728, 15297, 10440, 13865, 12360, 14488, 1219...
4830261[9728, 15297, 10440, 13865, 12360, 14488, 1219...
\n", "
" ], "text/plain": [ " user_id item_id\n", "0 203219 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n", "1 125519 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n", "2 626036 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n", "3 1029980 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n", "4 830261 [9728, 15297, 10440, 13865, 12360, 14488, 1219..." ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n", "top_N = 10\n", "recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n", "recs.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "recs = recs.explode('item_id')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idrank
020321997281
0203219152972
0203219104403
0203219138654
0203219123605
0203219144886
0203219121927
02032193418
02032195129
0203219415110
112551997281
1125519152972
\n", "
" ], "text/plain": [ " user_id item_id rank\n", "0 203219 9728 1\n", "0 203219 15297 2\n", "0 203219 10440 3\n", "0 203219 13865 4\n", "0 203219 12360 5\n", "0 203219 14488 6\n", "0 203219 12192 7\n", "0 203219 341 8\n", "0 203219 512 9\n", "0 203219 4151 10\n", "1 125519 9728 1\n", "1 125519 15297 2" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "recs['rank'] = recs.groupby('user_id').cumcount() + 1\n", "recs.head(top_N + 2)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MAP@10 0.089383\n", "Novelty@10 4.528709\n", "dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compute_metrics(train, test, recs, 10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Валидация на фолдах\n", "\n", "Возьмем 3 последние недели из наших данных, и будем тестировать на них последовательно (1 test fold - 1 неделя).\n", "\n", "Не забывайте про проблему холодного старта." 
] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(Timestamp('2021-08-01 00:00:00'), Timestamp('2021-08-22 00:00:00'))" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "last_date = interactions_df['last_watch_dt'].max().normalize()\n", "folds = 3\n", "start_date = last_date - pd.Timedelta(days=folds*7)\n", "start_date, last_date" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3, 3)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv = TimeRangeSplit(start_date=start_date, periods=folds+1, freq='W')\n", "\n", "cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='last_watch_dt')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2021-08-01', '2021-08-08', '2021-08-15', '2021-08-22'], dtype='datetime64[ns]', freq='W-SUN')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv.date_range" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Already seen number: 0\n", "Already seen number: 0\n", "Already seen number: 0\n" ] } ], "source": [ "folds_with_stats = list(cv.split(\n", " interactions_df, \n", " user_column='user_id',\n", " item_column='item_id',\n", " datetime_column='last_watch_dt',\n", " fold_stats=True\n", "))\n", "\n", "folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Start dateEnd dateTrainNew usersNew users interactionsNew itemsNew items interactionsKnown interactionsTest
02021-08-012021-08-0842038855340811276417470200264039
12021-08-082021-08-1545877085466211158015292820276699
22021-08-152021-08-2249852695601411662911459540297228
\n", "
" ], "text/plain": [ " Start date End date Train New users New users interactions \\\n", "0 2021-08-01 2021-08-08 4203885 53408 112764 \n", "1 2021-08-08 2021-08-15 4587708 54662 111580 \n", "2 2021-08-15 2021-08-22 4985269 56014 116629 \n", "\n", " New items New items interactions Known interactions Test \n", "0 174 7020 0 264039 \n", "1 152 9282 0 276699 \n", "2 114 5954 0 297228 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "folds_info_with_stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Популярное на фолдах" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "top_N = 10\n", "last_n_days = 7" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Collect one metrics Series per fold and build the frame once after the loop:\n", "# growing a DataFrame row by row is quadratic, and DataFrame.append was\n", "# removed in pandas 2.0.\n", "fold_results = []\n", "\n", "for train_idx, test_idx, info in folds_with_stats:\n", "    train = interactions_df.loc[train_idx]\n", "    test = interactions_df.loc[test_idx]\n", "\n", "    pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')\n", "    pop_model.fit(train)\n", "\n", "    # Every test user gets the same popular list; rank is the position in it.\n", "    recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n", "    recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n", "    recs = recs.explode('item_id')\n", "    recs['rank'] = recs.groupby('user_id').cumcount() + 1\n", "\n", "    fold_results.append(compute_metrics(train, test, recs, top_N))\n", "\n", "validation_results = pd.DataFrame(fold_results).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MAP@10 0.075772\n", "Novelty@10 4.175400\n", "dtype: float64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Соцдем популярное " ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "Посмотрим, имеет ли смысл предсказывать популярное в зависимости от соц.группы" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "train_idx, test_idx, info = folds_with_stats[0]\n", "train = interactions_df.loc[train_idx]\n", "test = interactions_df.loc[test_idx]\n", "date_window_for_popular = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)\n", "train_slice = pd.merge(train[train['last_watch_dt'] >= date_window_for_popular], users_df, on='user_id', how='left')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Как мы помним из предыдущего ноутбука, у нас есть пользователи без фичей, поэтому для них надо определить заполнение " ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idlast_watch_dttotal_durwatched_pctageincomesexkids_flg
079146681992021-07-277139age_18_24income_20_40FFalse
18178626162021-07-244142290age_35_44income_20_40FTrue
2161176104402021-07-29220age_25_34income_0_20FFalse
351390236142021-07-2411645NaNNaNNaNNaN
4568405152972021-07-3015298100age_18_24income_40_60FFalse
\n", "
" ], "text/plain": [ " user_id item_id last_watch_dt total_dur watched_pct age \\\n", "0 791466 8199 2021-07-27 713 9 age_18_24 \n", "1 81786 2616 2021-07-24 41422 90 age_35_44 \n", "2 161176 10440 2021-07-29 22 0 age_25_34 \n", "3 513902 3614 2021-07-24 1164 5 NaN \n", "4 568405 15297 2021-07-30 15298 100 age_18_24 \n", "\n", " income sex kids_flg \n", "0 income_20_40 F False \n", "1 income_20_40 F True \n", "2 income_0_20 F False \n", "3 NaN NaN NaN \n", "4 income_40_60 F False " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_slice.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "train_slice.fillna({'age':'age_unknown',\n", " 'sex':'sex_unknown',\n", " 'income': 'income_unknown',\n", " 'kids_flg': False\n", " }, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Например, можно смотреть популярное в разрезе возраста, пола и наличия детей" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idlast_watch_dttotal_durwatched_pctageincomesexkids_flg
079146681992021-07-277139age_18_24income_20_40FFalse
18178626162021-07-244142290age_35_44income_20_40FTrue
2161176104402021-07-29220age_25_34income_0_20FFalse
351390236142021-07-2411645age_unknownincome_unknownsex_unknownFalse
4568405152972021-07-3015298100age_18_24income_40_60FFalse
\n", "
" ], "text/plain": [ " user_id item_id last_watch_dt total_dur watched_pct age \\\n", "0 791466 8199 2021-07-27 713 9 age_18_24 \n", "1 81786 2616 2021-07-24 41422 90 age_35_44 \n", "2 161176 10440 2021-07-29 22 0 age_25_34 \n", "3 513902 3614 2021-07-24 1164 5 age_unknown \n", "4 568405 15297 2021-07-30 15298 100 age_18_24 \n", "\n", " income sex kids_flg \n", "0 income_20_40 F False \n", "1 income_20_40 F True \n", "2 income_0_20 F False \n", "3 income_unknown sex_unknown False \n", "4 income_40_60 F False " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_slice.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "soc_dem_recommendations = train_slice.groupby(\n", " ['age', 'sex', 'income', 'item_id']\n", ").size().to_frame().reset_index()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexincomeitem_id0
0age_18_24Fincome_0_20147
1age_18_24Fincome_0_20241
2age_18_24Fincome_0_20281
3age_18_24Fincome_0_20851
4age_18_24Fincome_0_20981
..................
74358age_unknownsex_unknownincome_unknown1649932
74359age_unknownsex_unknownincome_unknown165052
74360age_unknownsex_unknownincome_unknown165061
74361age_unknownsex_unknownincome_unknown16509199
74362age_unknownsex_unknownincome_unknown165164
\n", "

74363 rows × 5 columns

\n", "
" ], "text/plain": [ " age sex income item_id 0\n", "0 age_18_24 F income_0_20 14 7\n", "1 age_18_24 F income_0_20 24 1\n", "2 age_18_24 F income_0_20 28 1\n", "3 age_18_24 F income_0_20 85 1\n", "4 age_18_24 F income_0_20 98 1\n", "... ... ... ... ... ...\n", "74358 age_unknown sex_unknown income_unknown 16499 32\n", "74359 age_unknown sex_unknown income_unknown 16505 2\n", "74360 age_unknown sex_unknown income_unknown 16506 1\n", "74361 age_unknown sex_unknown income_unknown 16509 199\n", "74362 age_unknown sex_unknown income_unknown 16516 4\n", "\n", "[74363 rows x 5 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soc_dem_recommendations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Теперь надо просто для каждого пользователя выбрать самое популярные top_n объектов в его группе" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Можем проверить этот вариант на фолдах" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "validation_results = pd.DataFrame()\n", "\n", "for train_idx, test_idx, info in folds_with_stats:\n", " train = interactions_df.loc[train_idx]\n", " test = interactions_df.loc[test_idx]\n", " date_window = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)\n", " train_slice = pd.merge(train[train['last_watch_dt'] >= date_window], users_df, on='user_id', how='left')\n", " \n", " train_slice.fillna({\n", " 'age':'age_unknown',\n", " 'sex':'sex_unknown',\n", " 'income': 'income_unknown',\n", " 'kids_flg': False\n", " },inplace=True)\n", " \n", " soc_dem_recommendations = train_slice.groupby(\n", " ['age', 'sex', 'income', 'item_id']\n", " ).size().to_frame().reset_index()\n", " \n", " top_soc_dem = []\n", "\n", " for age in soc_dem_recommendations.age.unique():\n", " for income in soc_dem_recommendations.income.unique():\n", " for sex in soc_dem_recommendations.sex.unique():\n", " top_items = 
soc_dem_recommendations[\n", " (soc_dem_recommendations.age == age)\n", " & (soc_dem_recommendations.income == income)\n", " & (soc_dem_recommendations.sex == sex)].sort_values(0, ascending=False).head(10).item_id.values\n", " top_soc_dem.append([age, income, sex, top_items])\n", "\n", " top_soc_dem = pd.DataFrame(top_soc_dem, columns = ['age', 'income', 'sex', 'item_id'])\n", " \n", " recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n", " recs = pd.merge(recs[['user_id']], users_df, on='user_id', how='left')\n", " recs.fillna({\n", " 'age':'age_unknown',\n", " 'sex':'sex_unknown',\n", " 'income': 'income_unknown',\n", " 'kids_flg': False\n", " }, inplace=True)\n", " \n", " recs = pd.merge(recs, top_soc_dem, on = ['age', 'sex', 'income'], how = 'left')\n", " recs = recs.drop(columns = ['age', 'sex', 'income'])\n", " \n", " recs = recs.explode('item_id')\n", " recs['rank'] = recs.groupby('user_id').cumcount() + 1\n", " fold_result = compute_metrics(train, test, recs, top_N)\n", " \n", " validation_results = validation_results.append(fold_result, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MAP@10 0.080486\n", "Novelty@10 4.237977\n", "dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "В данном случае признаки, по которым вы строите популярное, подбираются так же, как и количество дней, которое вы берете для расчета популярного." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tfidf" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))\n", "users_mapping = {v: k for k, v in users_inv_mapping.items()}\n", "\n", "items_inv_mapping = 
dict(enumerate(interactions_df['item_id'].unique()))\n", "items_mapping = {v: k for k, v in items_inv_mapping.items()}" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# Collect one metrics Series per fold and build the frame once after the loop:\n", "# growing a DataFrame row by row is quadratic, and DataFrame.append was\n", "# removed in pandas 2.0.\n", "fold_results = []\n", "\n", "for train_idx, test_idx, info in folds_with_stats:\n", "    train = interactions_df.loc[train_idx]\n", "\n", "    # Fit only on the most recent 60 days of the training part of the fold.\n", "    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=60)\n", "    train = train[train['last_watch_dt'] >= date_window]\n", "\n", "    test = interactions_df.loc[test_idx]\n", "\n", "    train_mat = get_coo_matrix(\n", "        train,\n", "        users_mapping=users_mapping,\n", "        items_mapping=items_mapping,\n", "    ).tocsr()\n", "\n", "    model = TFIDFRecommender(K=top_N)\n", "    model.fit(train_mat.T, show_progress=False)\n", "\n", "    mapper = generate_implicit_recs_mapper(\n", "        model,\n", "        train_mat,\n", "        top_N,\n", "        users_mapping,\n", "        items_inv_mapping,\n", "        filter_already_liked_items=True\n", "    )\n", "\n", "    recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n", "    recs['item_id'] = recs['user_id'].map(mapper)\n", "    recs = recs.explode('item_id')\n", "    recs['rank'] = recs.groupby('user_id').cumcount() + 1\n", "\n", "    fold_results.append(compute_metrics(train, test, recs, top_N))\n", "\n", "validation_results = pd.DataFrame(fold_results).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MAP@10 0.084463\n", "Novelty@10 6.805098\n", "dtype: float64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean',})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Просто использовать код выше для submission не получится из-за холодных пользователей. Придется придумать, как их обработать." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Пример submission" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "submission = pd.read_csv('sample_submission.csv')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "train = interactions_df\n", "test = submission\n", "\n", "pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')\n", "pop_model.fit(train)\n", "\n", "recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n", "recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n", "recs = recs.explode('item_id')\n", "recs['rank'] = recs.groupby('user_id').cumcount() + 1\n", "recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_id
03[9728, 15297, 10440, 14488, 13865, 12192, 341,...
111[9728, 15297, 10440, 14488, 13865, 12192, 341,...
229[9728, 15297, 10440, 14488, 13865, 12192, 341,...
330[9728, 15297, 10440, 14488, 13865, 12192, 341,...
433[9728, 15297, 10440, 14488, 13865, 12192, 341,...
\n", "
" ], "text/plain": [ " user_id item_id\n", "0 3 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n", "1 11 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n", "2 29 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n", "3 30 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n", "4 33 [9728, 15297, 10440, 14488, 13865, 12192, 341,..." ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "recs.head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "recs.to_csv('sample_submission.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "churn", "language": "python", "name": "churn" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 }