{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Libs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.container { width:80% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style>.prompt { min-width:10ex !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style>div#notebook { font-size:12px !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import random\n",
    "import datetime\n",
    "import numpy as np\n",
    "\n",
    "import scipy.sparse as sp\n",
    "import pandas as pd\n",
    "\n",
    "from itertools import islice, cycle\n",
    "from more_itertools import pairwise\n",
    "from implicit.nearest_neighbours import TFIDFRecommender\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "mpl.rc('axes', labelsize=14)\n",
    "mpl.rc('xtick', labelsize=12)\n",
    "mpl.rc('ytick', labelsize=12)\n",
    "\n",
    "import seaborn as sns\n",
    "sns.set(style='whitegrid')\n",
    "sns.set(rc={'figure.figsize':(17, 9)})\n",
    "\n",
    "from IPython.core.display import display, HTML, clear_output\n",
    "display(HTML('<style>.container { width:80% !important; }</style>'))\n",
    "display(HTML('<style>.prompt { min-width:10ex !important; }</style>'))\n",
    "display(HTML('<style>div#notebook { font-size:12px !important; }</style>'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_novelty(train_interactions, recommendations, top_n): \n",
    "    users = recommendations['user_id'].unique()\n",
    "    n_users = train_interactions['user_id'].nunique()\n",
    "    n_users_per_item = train_interactions.groupby('item_id')['user_id'].nunique()\n",
    "\n",
    "    recommendations = recommendations.loc[recommendations['rank'] <= top_n].copy()\n",
    "    recommendations['n_users_per_item'] = recommendations['item_id'].map(n_users_per_item)\n",
    "    recommendations['n_users_per_item'] = recommendations['n_users_per_item'].fillna(1)\n",
    "    recommendations['item_novelty'] = -np.log2(recommendations['n_users_per_item'] / n_users)\n",
    "\n",
    "    item_novelties = recommendations[['user_id', 'rank', 'item_novelty']]\n",
    "    \n",
    "    miuf_at_k = item_novelties.loc[item_novelties['rank'] <= top_n, ['user_id', 'item_novelty']]\n",
    "    miuf_at_k = miuf_at_k.groupby('user_id').agg('mean').squeeze()\n",
    "\n",
    "    return miuf_at_k.reindex(users).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(train, test, recs, top_N):\n",
    "    result = {}\n",
    "    test_recs = test.set_index(['user_id', 'item_id']).join(recs.set_index(['user_id', 'item_id']))\n",
    "    test_recs = test_recs.sort_values(by=['user_id', 'rank'])\n",
    "\n",
    "    test_recs['users_item_count'] = test_recs.groupby(level='user_id')['rank'].transform(np.size)\n",
    "    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)\n",
    "    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1\n",
    "    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']\n",
    "    \n",
    "    users_count = test_recs.index.get_level_values('user_id').nunique()\n",
    "    \n",
    "    # Uncomment for Precision/Recall at k results\n",
    "\n",
    "#     for k in range(1, top_N + 1):\n",
    "#         hit_k = f'hit@{k}'\n",
    "#         test_recs[hit_k] = test_recs['rank'] <= k\n",
    "#         result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count\n",
    "#         result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count\n",
    "        \n",
    "    result[f'MAP@{top_N}'] = (test_recs['cumulative_rank'] / test_recs['users_item_count']).sum() / users_count\n",
    "    result[f'Novelty@{top_N}'] = calculate_novelty(train, recs, top_N)\n",
    "    \n",
    "    return pd.Series(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TimeRangeSplit():\n",
    "    \"\"\"\n",
    "        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html\n",
    "    \"\"\"\n",
    "    def __init__(self, \n",
    "                 start_date, \n",
    "                 end_date=None, \n",
    "                 freq='D', \n",
    "                 periods=None, \n",
    "                 tz=None, \n",
    "                 normalize=False, \n",
    "                 closed=None, \n",
    "                 train_min_date=None,\n",
    "                 filter_cold_users=True, \n",
    "                 filter_cold_items=True, \n",
    "                 filter_already_seen=True):\n",
    "        \n",
    "        self.start_date = start_date\n",
    "        if end_date is None and periods is None:\n",
    "            raise ValueError('Either \"end_date\" or \"periods\" must be non-zero, not both at the same time.')\n",
    "\n",
    "        self.end_date = end_date\n",
    "        self.freq = freq\n",
    "        self.periods = periods\n",
    "        self.tz = tz\n",
    "        self.normalize = normalize\n",
    "        self.closed = closed\n",
    "        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')\n",
    "        self.filter_cold_users = filter_cold_users\n",
    "        self.filter_cold_items = filter_cold_items\n",
    "        self.filter_already_seen = filter_already_seen\n",
    "\n",
    "        self.date_range = pd.date_range(\n",
    "            start=start_date, \n",
    "            end=end_date, \n",
    "            freq=freq, \n",
    "            periods=periods, \n",
    "            tz=tz, \n",
    "            normalize=normalize, \n",
    "            closed=closed)\n",
    "\n",
    "        self.max_n_splits = max(0, len(self.date_range) - 1)\n",
    "        if self.max_n_splits == 0:\n",
    "            raise ValueError('Provided parametrs set an empty date range.') \n",
    "\n",
    "    def split(self, \n",
    "              df, \n",
    "              user_column='user_id',\n",
    "              item_column='item_id',\n",
    "              datetime_column='date',\n",
    "              fold_stats=False):\n",
    "        df_datetime = df[datetime_column]\n",
    "        if self.train_min_date is not None:\n",
    "            train_min_mask = df_datetime >= self.train_min_date\n",
    "        else:\n",
    "            train_min_mask = df_datetime.notnull()\n",
    "\n",
    "        date_range = self.date_range[(self.date_range >= df_datetime.min()) & \n",
    "                                     (self.date_range <= df_datetime.max())]\n",
    "\n",
    "        for start, end in pairwise(date_range):\n",
    "            fold_info = {\n",
    "                'Start date': start,\n",
    "                'End date': end\n",
    "            }\n",
    "            train_mask = train_min_mask & (df_datetime < start)\n",
    "            train_idx = df.index[train_mask]\n",
    "            if fold_stats:\n",
    "                fold_info['Train'] = len(train_idx)\n",
    "\n",
    "            test_mask = (df_datetime >= start) & (df_datetime < end)\n",
    "            test_idx = df.index[test_mask]\n",
    "            \n",
    "            if self.filter_cold_users:\n",
    "                new = np.setdiff1d(\n",
    "                    df.loc[test_idx, user_column].unique(), \n",
    "                    df.loc[train_idx, user_column].unique())\n",
    "                new_idx = df.index[test_mask & df[user_column].isin(new)]\n",
    "                test_idx = np.setdiff1d(test_idx, new_idx)\n",
    "                test_mask = df.index.isin(test_idx)\n",
    "                if fold_stats:\n",
    "                    fold_info['New users'] = len(new)\n",
    "                    fold_info['New users interactions'] = len(new_idx)\n",
    "\n",
    "            if self.filter_cold_items:\n",
    "                new = np.setdiff1d(\n",
    "                    df.loc[test_idx, item_column].unique(), \n",
    "                    df.loc[train_idx, item_column].unique())\n",
    "                new_idx = df.index[test_mask & df[item_column].isin(new)]\n",
    "                test_idx = np.setdiff1d(test_idx, new_idx)\n",
    "                test_mask = df.index.isin(test_idx)\n",
    "                if fold_stats:\n",
    "                    fold_info['New items'] = len(new)\n",
    "                    fold_info['New items interactions'] = len(new_idx)\n",
    "\n",
    "            if self.filter_already_seen:\n",
    "                user_item = [user_column, item_column]\n",
    "                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index\n",
    "                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index\n",
    "                intersection = train_pairs.intersection(test_pairs)\n",
    "                print(f'Already seen number: {len(intersection)}')\n",
    "                test_idx = test_idx[~test_pairs.isin(intersection)]\n",
    "                # test_mask = rd.df.index.isin(test_idx)\n",
    "                if fold_stats:\n",
    "                    fold_info['Known interactions'] = len(intersection)\n",
    "\n",
    "            if fold_stats:\n",
    "                fold_info['Test'] = len(test_idx)\n",
    "\n",
    "            yield (train_idx, test_idx, fold_info)\n",
    "\n",
    "    def get_n_splits(self, df, datetime_column='date'):\n",
    "        df_datetime = df[datetime_column]\n",
    "        if self.train_min_date is not None:\n",
    "            df_datetime = df_datetime[df_datetime >= self.train_min_date]\n",
    "\n",
    "        date_range = self.date_range[(self.date_range >= df_datetime.min()) & \n",
    "                                     (self.date_range <= df_datetime.max())]\n",
    "\n",
    "        return max(0, len(date_range) - 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_coo_matrix(df, \n",
    "                   user_col='user_id', \n",
    "                   item_col='item_id', \n",
    "                   weight_col=None, \n",
    "                   users_mapping={}, \n",
    "                   items_mapping={}):\n",
    "    \n",
    "    if weight_col is None:\n",
    "        weights = np.ones(len(df), dtype=np.float32)\n",
    "    else:\n",
    "        weights = df[weight_col].astype(np.float32)\n",
    "\n",
    "    interaction_matrix = sp.coo_matrix((\n",
    "        weights, \n",
    "        (\n",
    "            df[user_col].map(users_mapping.get), \n",
    "            df[item_col].map(items_mapping.get)\n",
    "        )\n",
    "    ))\n",
    "    return interaction_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_implicit_recs_mapper(\n",
    "    model,\n",
    "    train_matrix,\n",
    "    top_N,\n",
    "    user_mapping,\n",
    "    item_inv_mapping,\n",
    "    filter_already_liked_items\n",
    "):\n",
    "    def _recs_mapper(user):\n",
    "        user_id = user_mapping[user]\n",
    "        recs = model.recommend(user_id, \n",
    "                               train_matrix, \n",
    "                               N=top_N, \n",
    "                               filter_already_liked_items=filter_already_liked_items)\n",
    "        return [item_inv_mapping[item] for item, _ in recs]\n",
    "    return _recs_mapper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "users_df = pd.read_csv('users_processed.csv',)\n",
    "items_df = pd.read_csv('items_processed.csv',)\n",
    "interactions_df = pd.read_csv('interactions_processed.csv', parse_dates=['last_watch_dt'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Baseline - популярное"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PopularRecommender():\n",
    "    def __init__(self, max_K=10, days=30, item_column='item_id', dt_column='date'):\n",
    "        self.max_K = max_K\n",
    "        self.days = days\n",
    "        self.item_column = item_column\n",
    "        self.dt_column = dt_column\n",
    "        self.recommendations = []\n",
    "        \n",
    "    def fit(self, df, ):\n",
    "        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)\n",
    "        self.recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values\n",
    "    \n",
    "    def recommend(self, users=None, N=10):\n",
    "        recs = self.recommendations[:N]\n",
    "        if users is None:\n",
    "            return recs\n",
    "        else:\n",
    "            return list(islice(cycle([recs]), len(users)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Пример на одном фолде"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = interactions_df[interactions_df['last_watch_dt'] == interactions_df['last_watch_dt'].max()]\n",
    "train = interactions_df[interactions_df['last_watch_dt'] < interactions_df['last_watch_dt'].max()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "pop_model = PopularRecommender(days=7, dt_column='last_watch_dt')\n",
    "pop_model.fit(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 9728, 15297, 10440, 13865, 12360, 14488, 12192,   341,   512,\n",
       "        4151])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top10_recs = pop_model.recommend()\n",
    "top10_recs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_titles = pd.Series(items_df['title'].values, index=items_df['item_id']).to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['гнев человеческий',\n",
       " 'клиника счастья',\n",
       " 'хрустальный',\n",
       " 'девятаев',\n",
       " 'круэлла',\n",
       " 'мастер меча',\n",
       " 'фемида видит',\n",
       " 'лето - это море',\n",
       " 'рядовой чээрин',\n",
       " 'секреты семейной жизни']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(map(item_titles.get, top10_recs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>[9728, 15297, 10440, 13865, 12360, 14488, 1219...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>125519</td>\n",
       "      <td>[9728, 15297, 10440, 13865, 12360, 14488, 1219...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>626036</td>\n",
       "      <td>[9728, 15297, 10440, 13865, 12360, 14488, 1219...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1029980</td>\n",
       "      <td>[9728, 15297, 10440, 13865, 12360, 14488, 1219...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>830261</td>\n",
       "      <td>[9728, 15297, 10440, 13865, 12360, 14488, 1219...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id                                            item_id\n",
       "0   203219  [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n",
       "1   125519  [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n",
       "2   626036  [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n",
       "3  1029980  [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n",
       "4   830261  [9728, 15297, 10440, 13865, 12360, 14488, 1219..."
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n",
    "top_N = 10\n",
    "recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n",
    "recs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "recs = recs.explode('item_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>9728</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>15297</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>10440</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>13865</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>12360</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>14488</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>12192</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>341</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>512</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>203219</td>\n",
       "      <td>4151</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>125519</td>\n",
       "      <td>9728</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>125519</td>\n",
       "      <td>15297</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id item_id  rank\n",
       "0   203219    9728     1\n",
       "0   203219   15297     2\n",
       "0   203219   10440     3\n",
       "0   203219   13865     4\n",
       "0   203219   12360     5\n",
       "0   203219   14488     6\n",
       "0   203219   12192     7\n",
       "0   203219     341     8\n",
       "0   203219     512     9\n",
       "0   203219    4151    10\n",
       "1   125519    9728     1\n",
       "1   125519   15297     2"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recs['rank'] = recs.groupby('user_id').cumcount() + 1\n",
    "recs.head(top_N + 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAP@10        0.089383\n",
       "Novelty@10    4.528709\n",
       "dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compute_metrics(train, test, recs, 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Валидация на фолдах\n",
    "\n",
    "Возьмем 3 последние недели из наших данных, и будем тестировать на них последовательно (1 test fold - 1 неделя).\n",
    "\n",
    "Не забывайте про проблему холодного старта."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Timestamp('2021-08-01 00:00:00'), Timestamp('2021-08-22 00:00:00'))"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "last_date = interactions_df['last_watch_dt'].max().normalize()\n",
    "folds = 3\n",
    "start_date = last_date - pd.Timedelta(days=folds*7)\n",
    "start_date, last_date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 3)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv = TimeRangeSplit(start_date=start_date, periods=folds+1, freq='W')\n",
    "\n",
    "cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='last_watch_dt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['2021-08-01', '2021-08-08', '2021-08-15', '2021-08-22'], dtype='datetime64[ns]', freq='W-SUN')"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv.date_range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Already seen number: 0\n",
      "Already seen number: 0\n",
      "Already seen number: 0\n"
     ]
    }
   ],
   "source": [
    "folds_with_stats = list(cv.split(\n",
    "    interactions_df, \n",
    "    user_column='user_id',\n",
    "    item_column='item_id',\n",
    "    datetime_column='last_watch_dt',\n",
    "    fold_stats=True\n",
    "))\n",
    "\n",
    "folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Start date</th>\n",
       "      <th>End date</th>\n",
       "      <th>Train</th>\n",
       "      <th>New users</th>\n",
       "      <th>New users interactions</th>\n",
       "      <th>New items</th>\n",
       "      <th>New items interactions</th>\n",
       "      <th>Known interactions</th>\n",
       "      <th>Test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2021-08-01</td>\n",
       "      <td>2021-08-08</td>\n",
       "      <td>4203885</td>\n",
       "      <td>53408</td>\n",
       "      <td>112764</td>\n",
       "      <td>174</td>\n",
       "      <td>7020</td>\n",
       "      <td>0</td>\n",
       "      <td>264039</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2021-08-08</td>\n",
       "      <td>2021-08-15</td>\n",
       "      <td>4587708</td>\n",
       "      <td>54662</td>\n",
       "      <td>111580</td>\n",
       "      <td>152</td>\n",
       "      <td>9282</td>\n",
       "      <td>0</td>\n",
       "      <td>276699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2021-08-15</td>\n",
       "      <td>2021-08-22</td>\n",
       "      <td>4985269</td>\n",
       "      <td>56014</td>\n",
       "      <td>116629</td>\n",
       "      <td>114</td>\n",
       "      <td>5954</td>\n",
       "      <td>0</td>\n",
       "      <td>297228</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Start date   End date    Train  New users  New users interactions  \\\n",
       "0 2021-08-01 2021-08-08  4203885      53408                  112764   \n",
       "1 2021-08-08 2021-08-15  4587708      54662                  111580   \n",
       "2 2021-08-15 2021-08-22  4985269      56014                  116629   \n",
       "\n",
       "   New items  New items interactions  Known interactions    Test  \n",
       "0        174                    7020                   0  264039  \n",
       "1        152                    9282                   0  276699  \n",
       "2        114                    5954                   0  297228  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "folds_info_with_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Популярное на фолдах"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_N = 10\n",
    "last_n_days = 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "final_results = []\n",
    "validation_results = pd.DataFrame()\n",
    "\n",
    "for train_idx, test_idx, info in folds_with_stats:\n",
    "    train = interactions_df.loc[train_idx]\n",
    "    test = interactions_df.loc[test_idx]\n",
    "        \n",
    "    pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')\n",
    "    pop_model.fit(train)\n",
    "\n",
    "    recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n",
    "    recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n",
    "    recs = recs.explode('item_id')\n",
    "    recs['rank'] = recs.groupby('user_id').cumcount() + 1\n",
    "\n",
    "    fold_result = compute_metrics(train, test, recs, top_N)\n",
    "\n",
    "    validation_results = validation_results.append(fold_result, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAP@10        0.075772\n",
       "Novelty@10    4.175400\n",
       "dtype: float64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Соцдем популярное "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Посмотрим, имеет ли смысл предсказывать популярное в зависимости от соц.группы"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_idx, test_idx, info = folds_with_stats[0]\n",
    "train = interactions_df.loc[train_idx]\n",
    "test = interactions_df.loc[test_idx]\n",
    "date_window_for_popular = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)\n",
    "train_slice = pd.merge(train[train['last_watch_dt'] >= date_window_for_popular], users_df, on='user_id', how='left')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Как мы помним из предыдущего ноутбука, у нас есть пользователи без фичей, поэтому для них надо определить заполнение "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>last_watch_dt</th>\n",
       "      <th>total_dur</th>\n",
       "      <th>watched_pct</th>\n",
       "      <th>age</th>\n",
       "      <th>income</th>\n",
       "      <th>sex</th>\n",
       "      <th>kids_flg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>791466</td>\n",
       "      <td>8199</td>\n",
       "      <td>2021-07-27</td>\n",
       "      <td>713</td>\n",
       "      <td>9</td>\n",
       "      <td>age_18_24</td>\n",
       "      <td>income_20_40</td>\n",
       "      <td>F</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81786</td>\n",
       "      <td>2616</td>\n",
       "      <td>2021-07-24</td>\n",
       "      <td>41422</td>\n",
       "      <td>90</td>\n",
       "      <td>age_35_44</td>\n",
       "      <td>income_20_40</td>\n",
       "      <td>F</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>161176</td>\n",
       "      <td>10440</td>\n",
       "      <td>2021-07-29</td>\n",
       "      <td>22</td>\n",
       "      <td>0</td>\n",
       "      <td>age_25_34</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>F</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>513902</td>\n",
       "      <td>3614</td>\n",
       "      <td>2021-07-24</td>\n",
       "      <td>1164</td>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>568405</td>\n",
       "      <td>15297</td>\n",
       "      <td>2021-07-30</td>\n",
       "      <td>15298</td>\n",
       "      <td>100</td>\n",
       "      <td>age_18_24</td>\n",
       "      <td>income_40_60</td>\n",
       "      <td>F</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  item_id last_watch_dt  total_dur  watched_pct        age  \\\n",
       "0   791466     8199    2021-07-27        713            9  age_18_24   \n",
       "1    81786     2616    2021-07-24      41422           90  age_35_44   \n",
       "2   161176    10440    2021-07-29         22            0  age_25_34   \n",
       "3   513902     3614    2021-07-24       1164            5        NaN   \n",
       "4   568405    15297    2021-07-30      15298          100  age_18_24   \n",
       "\n",
       "         income  sex kids_flg  \n",
       "0  income_20_40    F    False  \n",
       "1  income_20_40    F     True  \n",
       "2   income_0_20    F    False  \n",
       "3           NaN  NaN      NaN  \n",
       "4  income_40_60    F    False  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_slice.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_slice.fillna({'age':'age_unknown',\n",
    "                    'sex':'sex_unknown',\n",
    "                    'income': 'income_unknown',\n",
    "                    'kids_flg': False\n",
    "                   }, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Например, можно смотреть популярное в разрезе возраста, пола и наличия детей"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>last_watch_dt</th>\n",
       "      <th>total_dur</th>\n",
       "      <th>watched_pct</th>\n",
       "      <th>age</th>\n",
       "      <th>income</th>\n",
       "      <th>sex</th>\n",
       "      <th>kids_flg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>791466</td>\n",
       "      <td>8199</td>\n",
       "      <td>2021-07-27</td>\n",
       "      <td>713</td>\n",
       "      <td>9</td>\n",
       "      <td>age_18_24</td>\n",
       "      <td>income_20_40</td>\n",
       "      <td>F</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81786</td>\n",
       "      <td>2616</td>\n",
       "      <td>2021-07-24</td>\n",
       "      <td>41422</td>\n",
       "      <td>90</td>\n",
       "      <td>age_35_44</td>\n",
       "      <td>income_20_40</td>\n",
       "      <td>F</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>161176</td>\n",
       "      <td>10440</td>\n",
       "      <td>2021-07-29</td>\n",
       "      <td>22</td>\n",
       "      <td>0</td>\n",
       "      <td>age_25_34</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>F</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>513902</td>\n",
       "      <td>3614</td>\n",
       "      <td>2021-07-24</td>\n",
       "      <td>1164</td>\n",
       "      <td>5</td>\n",
       "      <td>age_unknown</td>\n",
       "      <td>income_unknown</td>\n",
       "      <td>sex_unknown</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>568405</td>\n",
       "      <td>15297</td>\n",
       "      <td>2021-07-30</td>\n",
       "      <td>15298</td>\n",
       "      <td>100</td>\n",
       "      <td>age_18_24</td>\n",
       "      <td>income_40_60</td>\n",
       "      <td>F</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  item_id last_watch_dt  total_dur  watched_pct          age  \\\n",
       "0   791466     8199    2021-07-27        713            9    age_18_24   \n",
       "1    81786     2616    2021-07-24      41422           90    age_35_44   \n",
       "2   161176    10440    2021-07-29         22            0    age_25_34   \n",
       "3   513902     3614    2021-07-24       1164            5  age_unknown   \n",
       "4   568405    15297    2021-07-30      15298          100    age_18_24   \n",
       "\n",
       "           income          sex  kids_flg  \n",
       "0    income_20_40            F     False  \n",
       "1    income_20_40            F      True  \n",
       "2     income_0_20            F     False  \n",
       "3  income_unknown  sex_unknown     False  \n",
       "4    income_40_60            F     False  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_slice.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "soc_dem_recommendations = train_slice.groupby(\n",
    "    ['age', 'sex', 'income', 'item_id']\n",
    ").size().to_frame().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>income</th>\n",
       "      <th>item_id</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>age_18_24</td>\n",
       "      <td>F</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>14</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>age_18_24</td>\n",
       "      <td>F</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>24</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>age_18_24</td>\n",
       "      <td>F</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>28</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>age_18_24</td>\n",
       "      <td>F</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>85</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>age_18_24</td>\n",
       "      <td>F</td>\n",
       "      <td>income_0_20</td>\n",
       "      <td>98</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74358</th>\n",
       "      <td>age_unknown</td>\n",
       "      <td>sex_unknown</td>\n",
       "      <td>income_unknown</td>\n",
       "      <td>16499</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74359</th>\n",
       "      <td>age_unknown</td>\n",
       "      <td>sex_unknown</td>\n",
       "      <td>income_unknown</td>\n",
       "      <td>16505</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74360</th>\n",
       "      <td>age_unknown</td>\n",
       "      <td>sex_unknown</td>\n",
       "      <td>income_unknown</td>\n",
       "      <td>16506</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74361</th>\n",
       "      <td>age_unknown</td>\n",
       "      <td>sex_unknown</td>\n",
       "      <td>income_unknown</td>\n",
       "      <td>16509</td>\n",
       "      <td>199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74362</th>\n",
       "      <td>age_unknown</td>\n",
       "      <td>sex_unknown</td>\n",
       "      <td>income_unknown</td>\n",
       "      <td>16516</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>74363 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               age          sex          income  item_id    0\n",
       "0        age_18_24            F     income_0_20       14    7\n",
       "1        age_18_24            F     income_0_20       24    1\n",
       "2        age_18_24            F     income_0_20       28    1\n",
       "3        age_18_24            F     income_0_20       85    1\n",
       "4        age_18_24            F     income_0_20       98    1\n",
       "...            ...          ...             ...      ...  ...\n",
       "74358  age_unknown  sex_unknown  income_unknown    16499   32\n",
       "74359  age_unknown  sex_unknown  income_unknown    16505    2\n",
       "74360  age_unknown  sex_unknown  income_unknown    16506    1\n",
       "74361  age_unknown  sex_unknown  income_unknown    16509  199\n",
       "74362  age_unknown  sex_unknown  income_unknown    16516    4\n",
       "\n",
       "[74363 rows x 5 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soc_dem_recommendations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Теперь надо просто для каждого пользователя выбрать самое популярные top_n объектов в его группе"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Можем проверить этот вариант на фолдах"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "validation_results = pd.DataFrame()\n",
    "\n",
    "for train_idx, test_idx, info in folds_with_stats:\n",
    "    train = interactions_df.loc[train_idx]\n",
    "    test = interactions_df.loc[test_idx]\n",
    "    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)\n",
    "    train_slice = pd.merge(train[train['last_watch_dt'] >= date_window], users_df, on='user_id', how='left')\n",
    "    \n",
    "    train_slice.fillna({\n",
    "        'age':'age_unknown',\n",
    "        'sex':'sex_unknown',\n",
    "        'income': 'income_unknown',\n",
    "        'kids_flg': False\n",
    "    },inplace=True)\n",
    "    \n",
    "    soc_dem_recommendations = train_slice.groupby(\n",
    "        ['age', 'sex', 'income', 'item_id']\n",
    "    ).size().to_frame().reset_index()\n",
    "    \n",
    "    top_soc_dem = []\n",
    "\n",
    "    for age in soc_dem_recommendations.age.unique():\n",
    "        for income in soc_dem_recommendations.income.unique():\n",
    "            for sex in soc_dem_recommendations.sex.unique():\n",
    "                top_items = soc_dem_recommendations[\n",
    "                (soc_dem_recommendations.age == age)\n",
    "                & (soc_dem_recommendations.income == income)\n",
    "                & (soc_dem_recommendations.sex == sex)].sort_values(0, ascending=False).head(10).item_id.values\n",
    "                top_soc_dem.append([age, income, sex, top_items])\n",
    "\n",
    "    top_soc_dem = pd.DataFrame(top_soc_dem, columns = ['age', 'income', 'sex', 'item_id'])\n",
    "    \n",
    "    recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n",
    "    recs = pd.merge(recs[['user_id']], users_df, on='user_id', how='left')\n",
    "    recs.fillna({\n",
    "        'age':'age_unknown',\n",
    "        'sex':'sex_unknown',\n",
    "        'income': 'income_unknown',\n",
    "        'kids_flg': False\n",
    "    }, inplace=True)\n",
    "    \n",
    "    recs = pd.merge(recs, top_soc_dem, on = ['age', 'sex', 'income'], how = 'left')\n",
    "    recs = recs.drop(columns = ['age', 'sex', 'income'])\n",
    "    \n",
    "    recs = recs.explode('item_id')\n",
    "    recs['rank'] = recs.groupby('user_id').cumcount() + 1\n",
    "    fold_result = compute_metrics(train, test, recs, top_N)\n",
    "    \n",
    "    validation_results = validation_results.append(fold_result, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAP@10        0.080486\n",
       "Novelty@10    4.237977\n",
       "dtype: float64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "В данном случае признаки, по которым вы строите популярное, подбираются, также, как и кол-во дней, которое вы берете для расчета популярного "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tfidf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))\n",
    "users_mapping = {v: k for k, v in users_inv_mapping.items()}\n",
    "\n",
    "items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))\n",
    "items_mapping = {v: k for k, v in items_inv_mapping.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "validation_results = pd.DataFrame()\n",
    "\n",
    "for train_idx, test_idx, info in folds_with_stats:\n",
    "    train = interactions_df.loc[train_idx]\n",
    "\n",
    "    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=60)\n",
    "    train = train[train['last_watch_dt'] >= date_window]\n",
    "\n",
    "    test = interactions_df.loc[test_idx]\n",
    "\n",
    "    train_mat = get_coo_matrix(\n",
    "        train,\n",
    "        users_mapping=users_mapping,\n",
    "        items_mapping=items_mapping,\n",
    "    ).tocsr()\n",
    "\n",
    "    model = TFIDFRecommender(K=top_N)\n",
    "    model.fit(train_mat.T, show_progress=False) \n",
    "\n",
    "    mapper = generate_implicit_recs_mapper( \n",
    "        model,\n",
    "        train_mat,\n",
    "        top_N,\n",
    "        users_mapping,\n",
    "        items_inv_mapping,\n",
    "        filter_already_liked_items=True\n",
    "    )\n",
    "\n",
    "    recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n",
    "    recs['item_id'] = recs['user_id'].map(mapper)\n",
    "    recs = recs.explode('item_id')\n",
    "    recs['rank'] = recs.groupby('user_id').cumcount() + 1\n",
    "    fold_result = compute_metrics(train, test, recs, top_N)\n",
    "\n",
    "    validation_results = validation_results.append(fold_result, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MAP@10        0.084463\n",
       "Novelty@10    6.805098\n",
       "dtype: float64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean',})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Просто использовать код выше для submission не получится из-за холодных пользователей. Придется придумать, как их обработать."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Пример submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = pd.read_csv('sample_submission.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = interactions_df\n",
    "test = submission\n",
    "\n",
    "pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')\n",
    "pop_model.fit(train)\n",
    "\n",
    "recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n",
    "recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n",
    "recs = recs.explode('item_id')\n",
    "recs['rank'] = recs.groupby('user_id').cumcount() + 1\n",
    "recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>[9728, 15297, 10440, 14488, 13865, 12192, 341,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11</td>\n",
       "      <td>[9728, 15297, 10440, 14488, 13865, 12192, 341,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>29</td>\n",
       "      <td>[9728, 15297, 10440, 14488, 13865, 12192, 341,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>30</td>\n",
       "      <td>[9728, 15297, 10440, 14488, 13865, 12192, 341,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33</td>\n",
       "      <td>[9728, 15297, 10440, 14488, 13865, 12192, 341,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id                                            item_id\n",
       "0        3  [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n",
       "1       11  [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n",
       "2       29  [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n",
       "3       30  [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n",
       "4       33  [9728, 15297, 10440, 14488, 13865, 12192, 341,..."
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "recs.to_csv('sample_submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "churn",
   "language": "python",
   "name": "churn"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}