## Libs

In [1]:
import random
import datetime
import numpy as np

import scipy.sparse as sp
import pandas as pd

from itertools import islice, cycle
from more_itertools import pairwise
from implicit.nearest_neighbours import TFIDFRecommender

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn in ONE call: every sns.set() re-applies the whole theme with
# its default arguments, so a second call without style= would silently revert
# 'whitegrid' to the default style.
sns.set(style='whitegrid', rc={'figure.figsize': (17, 9)})

# Font-size overrides must come after sns.set(), which resets these rcParams
# as part of its plotting context.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

from IPython.core.display import display, HTML, clear_output
# Notebook-only cosmetics: inject CSS that sets the container width to 80%,
# gives the prompt column a minimum width and sets the notebook font size.
display(HTML('<style>.container { width:80% !important; }</style>'))
display(HTML('<style>.prompt { min-width:10ex !important; }</style>'))
display(HTML('<style>div#notebook { font-size:12px !important; }</style>'))

## Functions

In [2]:
def calculate_novelty(train_interactions, recommendations, top_n):
    """Mean novelty (mean inverse user frequency) of the top-``top_n`` recommendations.

    The novelty of an item is ``-log2(n_users_who_interacted / n_train_users)``.
    Items never seen in ``train_interactions`` (including missing item ids) are
    treated as having exactly one user, i.e. they get the maximum novelty.

    Parameters
    ----------
    train_interactions : pd.DataFrame
        Needs columns ``user_id`` and ``item_id``.
    recommendations : pd.DataFrame
        Needs columns ``user_id``, ``item_id`` and ``rank``.
    top_n : int
        Only rows with ``rank <= top_n`` are scored.

    Returns
    -------
    float
        Per-user mean item novelty, averaged over the users present in
        ``recommendations``. Users with no row inside the top-``top_n`` are
        ignored by the final mean; the result is NaN if nobody has one.
    """
    users = recommendations['user_id'].unique()
    n_users = train_interactions['user_id'].nunique()
    n_users_per_item = train_interactions.groupby('item_id')['user_id'].nunique()

    top_recs = recommendations.loc[
        recommendations['rank'] <= top_n, ['user_id', 'item_id']
    ].copy()
    item_popularity = top_recs['item_id'].map(n_users_per_item).fillna(1)
    top_recs['item_novelty'] = -np.log2(item_popularity / n_users)

    # Select the column before aggregating so the result is always a Series
    # indexed by user_id. The previous DataFrame.agg(...).squeeze() collapsed
    # to a scalar when there was a single user, and .reindex() then failed.
    miuf_at_k = top_recs.groupby('user_id')['item_novelty'].mean()

    return miuf_at_k.reindex(users).mean()

In [3]:
def compute_metrics(train, test, recs, top_N):
    """Compute MAP@top_N and Novelty@top_N for one set of recommendations.

    ``test`` and ``recs`` are joined on ``(user_id, item_id)``; a test row that
    was not recommended keeps a missing ``rank``. MAP is averaged over the
    users present in ``test``. Novelty is delegated to ``calculate_novelty``.

    Returns a ``pd.Series`` with keys ``MAP@{top_N}`` and ``Novelty@{top_N}``.
    """
    pair_key = ['user_id', 'item_id']
    ranked_test = (
        test.set_index(pair_key)
        .join(recs.set_index(pair_key))
        .sort_values(by=['user_id', 'rank'])
    )

    # Number of test rows per user (rows with a missing rank are counted too).
    ranked_test['users_item_count'] = (
        ranked_test.groupby(level='user_id')['rank'].transform(np.size)
    )
    ranked_test['reciprocal_rank'] = (1 / ranked_test['rank']).fillna(0)

    # Rows are sorted by rank within each user, so the 1-based position of a
    # row is the number of hits so far; dividing by rank gives precision at
    # that hit. Rows with a missing rank yield NaN and drop out of the sum.
    hits_so_far = ranked_test.groupby(level='user_id').cumcount() + 1
    ranked_test['cumulative_rank'] = hits_so_far / ranked_test['rank']

    n_users = ranked_test.index.get_level_values('user_id').nunique()

    # Uncomment for Precision/Recall at k results (add the entries to `metrics`):
    #
    # for k in range(1, top_N + 1):
    #     hit_k = f'hit@{k}'
    #     ranked_test[hit_k] = ranked_test['rank'] <= k
    #     metrics[f'Precision@{k}'] = (ranked_test[hit_k] / k).sum() / n_users
    #     metrics[f'Recall@{k}'] = (ranked_test[hit_k] / ranked_test['users_item_count']).sum() / n_users

    average_precision_terms = ranked_test['cumulative_rank'] / ranked_test['users_item_count']
    metrics = {
        f'MAP@{top_N}': average_precision_terms.sum() / n_users,
        f'Novelty@{top_N}': calculate_novelty(train, recs, top_N),
    }

    return pd.Series(metrics)

In [4]:
class TimeRangeSplit():
    """
    Time-based cross-validation splitter whose fold boundaries come from ``pd.date_range``.

    Every pair of consecutive boundaries ``(start, end)`` defines one fold:
    rows dated strictly before ``start`` (and not earlier than
    ``train_min_date``, if given) form the train part, rows dated in
    ``[start, end)`` form the test part.

    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html
    """
    def __init__(self, 
                 start_date, 
                 end_date=None, 
                 freq='D', 
                 periods=None, 
                 tz=None, 
                 normalize=False, 
                 closed=None, 
                 train_min_date=None,
                 filter_cold_users=True, 
                 filter_cold_items=True, 
                 filter_already_seen=True):
        """
        Parameters
        ----------
        start_date, end_date, freq, periods, tz, normalize
            Forwarded to ``pd.date_range``. At least one of ``end_date`` and
            ``periods`` is required.
        closed : {None, 'left', 'right'}
            Which boundary of the range to keep. ``None`` (default) is not
            forwarded at all; other values are passed as ``inclusive=`` where
            pandas supports it and as the legacy ``closed=`` otherwise.
        train_min_date : datetime-like, optional
            Rows older than this are excluded from every train part.
        filter_cold_users, filter_cold_items : bool
            Drop test rows whose user / item does not occur in the train part.
        filter_already_seen : bool
            Drop test rows whose (user, item) pair already occurs in train.

        Raises
        ------
        ValueError
            If neither ``end_date`` nor ``periods`` is given, or if the
            resulting range has fewer than two boundaries.
        """
        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError('Either "end_date" or "periods" must be specified.')

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.closed = closed
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        range_kwargs = dict(
            start=start_date,
            end=end_date,
            freq=freq,
            periods=periods,
            tz=tz,
            normalize=normalize)
        if closed is None:
            # Do not forward the argument: `closed` was removed from
            # pd.date_range in pandas 2.0 and the default already keeps both ends.
            self.date_range = pd.date_range(**range_kwargs)
        else:
            try:
                self.date_range = pd.date_range(inclusive=closed, **range_kwargs)
            except TypeError:
                # Older pandas without `inclusive`: fall back to the legacy name.
                self.date_range = pd.date_range(closed=closed, **range_kwargs)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError('Provided parametrs set an empty date range.') 

    def split(self, 
              df, 
              user_column='user_id',
              item_column='item_id',
              datetime_column='date',
              fold_stats=False):
        """
        Yield ``(train_idx, test_idx, fold_info)`` for every fold.

        Only boundaries lying inside ``[df[datetime_column].min(),
        df[datetime_column].max()]`` are used. ``train_idx`` / ``test_idx`` hold
        index labels of ``df`` (use with ``df.loc``). ``fold_info`` always has
        'Start date' and 'End date'; with ``fold_stats=True`` it also has the
        sizes of the train/test parts and of every enabled filter.

        The filters identify rows by index label (``np.setdiff1d`` /
        ``Index.isin``), so ``df`` is expected to have a unique index.
        """
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        # Consecutive boundary pairs; an empty or single-element range yields no folds.
        for start, end in zip(date_range[:-1], date_range[1:]):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            if fold_stats:
                fold_info['Train'] = len(train_idx)

            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]
            
            if self.filter_cold_users:
                # Users that appear in test but never in train.
                new = np.setdiff1d(
                    df.loc[test_idx, user_column].unique(), 
                    df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)

            if self.filter_cold_items:
                # Items that appear in the (already filtered) test but never in train.
                new = np.setdiff1d(
                    df.loc[test_idx, item_column].unique(), 
                    df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)

            if self.filter_already_seen:
                # Drop test rows whose (user, item) pair is already present in train.
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                print(f'Already seen number: {len(intersection)}')
                test_idx = test_idx[~test_pairs.isin(intersection)]
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Test'] = len(test_idx)

            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        """
        Number of folds available for ``df``.

        Unlike ``split``, the lower bound used here is taken after dropping rows
        older than ``train_min_date``, so the two can disagree when
        ``train_min_date`` falls inside the configured range.
        """
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        return max(0, len(date_range) - 1)

In [5]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    """Build a sparse user x item interaction matrix in COO format.

    Parameters
    ----------
    df : pd.DataFrame
        Interactions; one row per (user, item) entry.
    user_col, item_col : str
        Columns holding the external user / item ids.
    weight_col : str, optional
        Column with the entry values. When omitted every entry is 1.
    users_mapping, items_mapping : dict, optional
        External id -> integer row / column index. ``None`` is treated as an
        empty mapping.

    Returns
    -------
    scipy.sparse.coo_matrix
        float32 matrix whose shape is inferred from the largest row / column
        index present in ``df`` (NOT from the size of the mappings).

    Raises
    ------
    ValueError
        If any id in ``df`` is missing from the corresponding mapping.
    """
    # None instead of mutable `{}` defaults, which are shared between calls.
    users_mapping = {} if users_mapping is None else users_mapping
    items_mapping = {} if items_mapping is None else items_mapping

    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32).to_numpy()

    row_idx = df[user_col].map(users_mapping.get)
    col_idx = df[item_col].map(items_mapping.get)

    # Fail loudly: an unmapped id would otherwise turn into a NaN/None index.
    for col_name, mapped in ((user_col, row_idx), (item_col, col_idx)):
        n_missing = int(mapped.isna().sum())
        if n_missing:
            raise ValueError(
                f'{n_missing} value(s) of column "{col_name}" are missing from the id mapping.'
            )

    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            row_idx.to_numpy(), 
            col_idx.to_numpy()
        )
    ))
    return interaction_matrix

In [6]:
def generate_implicit_recs_mapper(
    model,
    train_matrix,
    top_N,
    user_mapping,
    item_inv_mapping,
    filter_already_liked_items
):
    """Return a function that maps an external user id to a list of external item ids.

    The returned callable translates the user through ``user_mapping``, calls
    ``model.recommend`` with ``train_matrix``, ``N=top_N`` and
    ``filter_already_liked_items``, and translates every recommended internal
    item id back through ``item_inv_mapping``. ``model.recommend`` is expected
    to return an iterable of ``(item, score)`` pairs; the scores are discarded.
    """
    def _recs_mapper(user):
        internal_user = user_mapping[user]
        scored_items = model.recommend(
            internal_user,
            train_matrix,
            N=top_N,
            filter_already_liked_items=filter_already_liked_items,
        )
        external_items = []
        for internal_item, _score in scored_items:
            external_items.append(item_inv_mapping[internal_item])
        return external_items

    return _recs_mapper

## Getting data

In [7]:
# Load the three pre-processed tables from the working directory;
# only the interactions file has its date column parsed on load.
users_df = pd.read_csv('users_processed.csv')
items_df = pd.read_csv('items_processed.csv')
interactions_df = pd.read_csv(
    'interactions_processed.csv',
    parse_dates=['last_watch_dt'],
)

# Baseline - популярное

In [8]:
class PopularRecommender():
    """Non-personalised baseline: the most frequent items of the last ``days`` days.

    ``fit`` keeps at most ``max_K`` item ids ordered by interaction count within
    the window; ``recommend`` returns a prefix of that list.
    """

    def __init__(self, max_K=10, days=30, item_column='item_id', dt_column='date'):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        # Empty until fit() is called.
        self.recommendations = []

    def fit(self, df):
        """Store the ``max_K`` most frequent items dated after (latest day - ``days``)."""
        timestamps = df[self.dt_column]
        window_start = timestamps.max().normalize() - pd.DateOffset(days=self.days)
        recent_items = df.loc[timestamps > window_start, self.item_column]
        popularity = recent_items.value_counts()
        self.recommendations = popularity.head(self.max_K).index.values

    def recommend(self, users=None, N=10):
        """Return the top-``N`` items, or a list with one such entry per user.

        With ``users`` given, the result is a list of ``len(users)`` references
        to the same top-``N`` sequence.
        """
        top_items = self.recommendations[:N]
        if users is not None:
            return [top_items for _ in range(len(users))]
        return top_items

### Пример на одном фолде

In [9]:
# Hold out the interactions carrying the latest timestamp as test;
# everything strictly earlier is train.
max_watch_dt = interactions_df['last_watch_dt'].max()
test = interactions_df[interactions_df['last_watch_dt'] == max_watch_dt]
train = interactions_df[interactions_df['last_watch_dt'] < max_watch_dt]

In [10]:
# Popularity baseline over a 7-day window, using `last_watch_dt` as the date column.
pop_model = PopularRecommender(days=7, dt_column='last_watch_dt')
pop_model.fit(train)

In [11]:
# Without `users`, recommend() returns the plain top-N item ids (N defaults to 10).
top10_recs = pop_model.recommend()
top10_recs

array([ 9728, 15297, 10440, 13865, 12360, 14488, 12192,   341,   512,
        4151])

In [12]:
# item_id -> title lookup, used to make recommendation lists human-readable.
item_titles = items_df.set_index('item_id')['title'].to_dict()

In [13]:
# Titles of the recommended items (None for an id without a title entry).
[item_titles.get(item_id) for item_id in top10_recs]

['гнев человеческий',
 'клиника счастья',
 'хрустальный',
 'девятаев',
 'круэлла',
 'мастер меча',
 'фемида видит',
 'лето - это море',
 'рядовой чээрин',
 'секреты семейной жизни']

In [14]:
# One row per unique test user; each row's `item_id` cell holds the whole
# top-N list (the same list for everybody, since the model is not personalised).
recs = pd.DataFrame({'user_id': test['user_id'].unique()})
top_N = 10
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs.head()

Unnamed: 0,user_id,item_id
0,203219,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
1,125519,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
2,626036,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
3,1029980,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
4,830261,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."


In [15]:
# Long format: one row per (user, recommended item); list order is preserved.
recs = recs.explode('item_id')

In [16]:
# 1-based position of each item within its user's list.
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(top_N + 2)

Unnamed: 0,user_id,item_id,rank
0,203219,9728,1
0,203219,15297,2
0,203219,10440,3
0,203219,13865,4
0,203219,12360,5
0,203219,14488,6
0,203219,12192,7
0,203219,341,8
0,203219,512,9
0,203219,4151,10


In [17]:
# MAP@10 and Novelty@10 of the popularity baseline on the single hold-out fold.
compute_metrics(train, test, recs, 10)

MAP@10        0.089383
Novelty@10    4.528709
dtype: float64

# Валидация на фолдах

Возьмем 3 последние недели из наших данных, и будем тестировать на них последовательно (1 test fold - 1 неделя).

Не забывайте про проблему холодного старта.

In [18]:
# Validation window: the `folds` weeks ending at the latest (midnight-normalised) date.
last_date = interactions_df['last_watch_dt'].max().normalize()
folds = 3
start_date = last_date - pd.Timedelta(days=folds*7)
start_date, last_date

(Timestamp('2021-08-01 00:00:00'), Timestamp('2021-08-22 00:00:00'))

In [19]:
# folds + 1 weekly boundaries give `folds` consecutive one-week test periods.
cv = TimeRangeSplit(start_date=start_date, periods=folds+1, freq='W')

# Configured number of folds vs. the number actually covered by the data.
cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='last_watch_dt')

(3, 3)

In [20]:
# Fold boundaries produced by the splitter.
cv.date_range

DatetimeIndex(['2021-08-01', '2021-08-08', '2021-08-15', '2021-08-22'], dtype='datetime64[ns]', freq='W-SUN')

In [21]:
# Materialise the generator once so the same folds can be reused by every model below.
folds_with_stats = list(cv.split(
    interactions_df, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='last_watch_dt',
    fold_stats=True
))

# One row of statistics per fold (third element of each yielded tuple).
folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

Already seen number: 0
Already seen number: 0
Already seen number: 0


In [22]:
# Per-fold sizes of train/test and of the cold-start / already-seen filters.
folds_info_with_stats

Unnamed: 0,Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
0,2021-08-01,2021-08-08,4203885,53408,112764,174,7020,0,264039
1,2021-08-08,2021-08-15,4587708,54662,111580,152,9282,0,276699
2,2021-08-15,2021-08-22,4985269,56014,116629,114,5954,0,297228


# Популярное на фолдах

In [23]:
# Shared settings for the experiments below: list length and popularity window (days).
top_N = 10
last_n_days = 7

In [24]:
final_results = []  # not used in this cell; kept so the name stays defined
fold_results = []

for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]

    pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')
    pop_model.fit(train)

    # Same top-N list for every test user, exploded to one row per (user, item).
    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1

    fold_results.append(compute_metrics(train, test, recs, top_N))

# Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
# a frame row by row is quadratic.
validation_results = pd.DataFrame(fold_results)

In [25]:
# Metrics averaged over the folds.
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})

MAP@10        0.075772
Novelty@10    4.175400
dtype: float64

# Соцдем популярное 

Посмотрим, имеет ли смысл предсказывать популярное в зависимости от соц.группы

In [26]:
# Work on the first fold only: take the last `last_n_days` days of its train part
# and attach the user features (left join, so users without features are kept).
train_idx, test_idx, info = folds_with_stats[0]
train = interactions_df.loc[train_idx]
test = interactions_df.loc[test_idx]
date_window_for_popular = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)
train_slice = pd.merge(train[train['last_watch_dt'] >= date_window_for_popular], users_df, on='user_id', how='left')

Как мы помним из предыдущего ноутбука, у нас есть пользователи без фичей, поэтому для них надо определить, чем заполнять пропуски.

In [27]:
# Preview before filling: rows of users without features have empty feature columns.
train_slice.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,age,income,sex,kids_flg
0,791466,8199,2021-07-27,713,9,age_18_24,income_20_40,F,False
1,81786,2616,2021-07-24,41422,90,age_35_44,income_20_40,F,True
2,161176,10440,2021-07-29,22,0,age_25_34,income_0_20,F,False
3,513902,3614,2021-07-24,1164,5,,,,
4,568405,15297,2021-07-30,15298,100,age_18_24,income_40_60,F,False


In [28]:
# Give users without features an explicit "unknown" category. Rebinding the name
# (instead of inplace=True) keeps the cell free of in-place mutation.
socdem_defaults = {
    'age': 'age_unknown',
    'sex': 'sex_unknown',
    'income': 'income_unknown',
    'kids_flg': False,
}
train_slice = train_slice.fillna(socdem_defaults)

Например, можно смотреть популярное в разрезе возраста, пола и дохода (признак наличия детей в группировке ниже не используется).

In [29]:
# Preview after filling: the missing features now show the "unknown" placeholders.
train_slice.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,age,income,sex,kids_flg
0,791466,8199,2021-07-27,713,9,age_18_24,income_20_40,F,False
1,81786,2616,2021-07-24,41422,90,age_35_44,income_20_40,F,True
2,161176,10440,2021-07-29,22,0,age_25_34,income_0_20,F,False
3,513902,3614,2021-07-24,1164,5,age_unknown,income_unknown,sex_unknown,False
4,568405,15297,2021-07-30,15298,100,age_18_24,income_40_60,F,False


In [30]:
# Interaction count per (age, sex, income, item); the unnamed count column
# ends up labelled 0 after reset_index().
soc_dem_recommendations = train_slice.groupby(
    ['age', 'sex', 'income', 'item_id']
).size().to_frame().reset_index()

In [31]:
# Per-group item counts (column 0 holds the count).
soc_dem_recommendations

Unnamed: 0,age,sex,income,item_id,0
0,age_18_24,F,income_0_20,14,7
1,age_18_24,F,income_0_20,24,1
2,age_18_24,F,income_0_20,28,1
3,age_18_24,F,income_0_20,85,1
4,age_18_24,F,income_0_20,98,1
...,...,...,...,...,...
74358,age_unknown,sex_unknown,income_unknown,16499,32
74359,age_unknown,sex_unknown,income_unknown,16505,2
74360,age_unknown,sex_unknown,income_unknown,16506,1
74361,age_unknown,sex_unknown,income_unknown,16509,199


Теперь надо просто для каждого пользователя выбрать top_n самых популярных объектов в его группе.

Можем проверить этот вариант на фолдах

In [32]:
socdem_cols = ['age', 'sex', 'income']
socdem_fill = {
    'age': 'age_unknown',
    'sex': 'sex_unknown',
    'income': 'income_unknown',
    'kids_flg': False
}
fold_results = []

for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)
    train_slice = pd.merge(train[train['last_watch_dt'] >= date_window], users_df, on='user_id', how='left')
    train_slice = train_slice.fillna(socdem_fill)

    # Interaction count per (age, sex, income, item); the count column is named 0.
    soc_dem_recommendations = train_slice.groupby(
        socdem_cols + ['item_id']
    ).size().to_frame().reset_index()

    # Top-N items of every demographic group, most popular first. A stable sort
    # keeps ties in a deterministic order, and top_N replaces the hard-coded 10.
    top_soc_dem = (
        soc_dem_recommendations
        .sort_values(0, ascending=False, kind='stable')
        .groupby(socdem_cols)
        .head(top_N)
        .groupby(socdem_cols)['item_id']
        .agg(list)
        .reset_index()
    )

    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs = pd.merge(recs[['user_id']], users_df, on='user_id', how='left')
    recs = recs.fillna(socdem_fill)

    # NOTE(review): a user whose group has no interactions in the window gets a
    # single row with a missing item_id (rank 1). It never matches a test item but
    # is scored as a maximally novel item by calculate_novelty - consider a
    # fallback to the global popular list.
    recs = pd.merge(recs, top_soc_dem, on=socdem_cols, how='left')
    recs = recs.drop(columns=socdem_cols)

    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1

    fold_results.append(compute_metrics(train, test, recs, top_N))

# Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
# a frame row by row is quadratic.
validation_results = pd.DataFrame(fold_results)

In [33]:
# Metrics averaged over the folds.
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})

MAP@10        0.080486
Novelty@10    4.237977
dtype: float64

В данном случае признаки, по которым вы строите популярное, подбираются так же, как и количество дней, которое вы берете для расчета популярного.

# Tfidf

In [34]:
# Contiguous integer indices for users and items, in order of first appearance
# in the interactions. `*_inv_mapping`: index -> external id; `*_mapping`: the reverse.
unique_user_ids = interactions_df['user_id'].unique()
users_inv_mapping = dict(enumerate(unique_user_ids))
users_mapping = {user_id: idx for idx, user_id in users_inv_mapping.items()}

unique_item_ids = interactions_df['item_id'].unique()
items_inv_mapping = dict(enumerate(unique_item_ids))
items_mapping = {item_id: idx for idx, item_id in items_inv_mapping.items()}

In [35]:
tfidf_train_days = 60  # length of the training window, in days
fold_results = []

for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]

    # Train only on the most recent `tfidf_train_days` days of the fold.
    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=tfidf_train_days)
    train = train[train['last_watch_dt'] >= date_window]

    test = interactions_df.loc[test_idx]

    # Rows are users, columns are items. The shape is inferred from the largest
    # index present in the window, not from the size of the mappings.
    train_mat = get_coo_matrix(
        train,
        users_mapping=users_mapping,
        items_mapping=items_mapping,
    ).tocsr()

    # fit() receives the transposed (item x user) matrix, recommend() the
    # user x item one. NOTE(review): this matches the older `implicit` API that
    # returns (item, score) pairs - confirm the installed version.
    model = TFIDFRecommender(K=top_N)
    model.fit(train_mat.T, show_progress=False)

    mapper = generate_implicit_recs_mapper(
        model,
        train_mat,
        top_N,
        users_mapping,
        items_inv_mapping,
        filter_already_liked_items=True
    )

    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = recs['user_id'].map(mapper)
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1

    # NOTE(review): `train` here is the windowed frame, so Novelty is measured
    # against a different user base than for the popularity baselines - confirm
    # this is intended before comparing the numbers.
    fold_results.append(compute_metrics(train, test, recs, top_N))

# Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
# a frame row by row is quadratic.
validation_results = pd.DataFrame(fold_results)

In [36]:
# Metrics averaged over the folds.
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean',})

MAP@10        0.084463
Novelty@10    6.805098
dtype: float64

Просто использовать код выше для submission не получится из-за холодных пользователей. Придется придумать, как их обработать.

# Пример submission

In [37]:
# Template file; only its `user_id` column is used below to know whom to predict for.
submission = pd.read_csv('sample_submission.csv')

In [38]:
# Final fit on ALL interactions; the "test" users come from the submission template.
train = interactions_df
test = submission

pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')
pop_model.fit(train)

# Same popular top-N for every user, so cold users are covered as well.
recs = pd.DataFrame({'user_id': test['user_id'].unique()})
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1
# Collapse back to one row per user with a list of item ids; `rank` is not
# part of the aggregation and is dropped here.
recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()

In [39]:
# Preview of the submission frame: one row per user, list-valued `item_id`.
recs.head()

Unnamed: 0,user_id,item_id
0,3,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
1,11,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
2,29,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
3,30,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
4,33,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."


In [40]:
# NOTE(review): this writes to the same path the template was read from
# ('sample_submission.csv'), overwriting it - confirm that is intended or
# save under a different file name.
recs.to_csv('sample_submission.csv', index=False)