{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"_cell_guid": "fb777394-ad80-455a-b740-3316a97103d5",
"_uuid": "74dae92e7724c0527bcef0c9e0df43c98555ce44",
"execution": {
"iopub.execute_input": "2022-04-07T11:43:46.260868Z",
"iopub.status.busy": "2022-04-07T11:43:46.260236Z",
"iopub.status.idle": "2022-04-07T11:43:47.771563Z",
"shell.execute_reply": "2022-04-07T11:43:47.77053Z",
"shell.execute_reply.started": "2022-04-07T11:43:46.260815Z"
}
},
"outputs": [],
"source": [
"# Import libraries and set desired options\n",
"%matplotlib inline\n",
"from matplotlib import pyplot as plt\n",
"\n",
"import pickle\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "0cf36110-0ecb-4c85-9148-6b7b1e0cf55d",
"_uuid": "7c5cf8a9330aa1a9f767a56fde8f48fa842e2fe8"
},
"source": [
"Notebook by Yuri Kashnitsky, edited by Ivan Komarov. \n",
"\n",
"In this competition we are going to analyze a sequence of websites visited by a person to predict whether this person is Alice or not. The metric of evaluation is [ROC AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic). "
]
},
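{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get a feel for the metric, here is a toy illustration on made-up labels and scores (a sketch, not competition data): a model whose scores rank positive examples above negative ones gets a value close to 1, while random scores hover around 0.5."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy ROC AUC illustration on made-up data\n",
"toy_true = [0, 0, 1, 1]\n",
"toy_scores = [0.1, 0.4, 0.35, 0.8]\n",
"roc_auc_score(toy_true, toy_scores)  # 0.75: one positive is ranked below a negative"
]
},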
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "7e0b3ca3-790c-4bf3-9bec-356d3de9f274",
"_uuid": "e6a46d89c9b912f5858baa9930e6280b11750892"
},
"source": [
"### Data Downloading and Transformation\n",
"First, read the training and test sets. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.25832Z",
"iopub.status.busy": "2022-04-07T11:43:54.257675Z",
"iopub.status.idle": "2022-04-07T11:43:54.26551Z",
"shell.execute_reply": "2022-04-07T11:43:54.264596Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.258256Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['time1',\n",
" 'time2',\n",
" 'time3',\n",
" 'time4',\n",
" 'time5',\n",
" 'time6',\n",
" 'time7',\n",
" 'time8',\n",
" 'time9',\n",
" 'time10']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"times = ['time'+str(i) for i in range(1,11)]\n",
"times"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"_cell_guid": "a46803c1-86dc-4a19-9007-17e0da493a05",
"_uuid": "eb4a1b566bef6a987555baf485809e94e08c31f6",
"execution": {
"iopub.execute_input": "2022-04-07T11:43:47.774521Z",
"iopub.status.busy": "2022-04-07T11:43:47.77399Z",
"iopub.status.idle": "2022-04-07T11:43:54.256103Z",
"shell.execute_reply": "2022-04-07T11:43:54.255122Z",
"shell.execute_reply.started": "2022-04-07T11:43:47.774464Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" site1 | \n",
" time1 | \n",
" site2 | \n",
" time2 | \n",
" site3 | \n",
" time3 | \n",
" site4 | \n",
" time4 | \n",
" site5 | \n",
" time5 | \n",
" ... | \n",
" time6 | \n",
" site7 | \n",
" time7 | \n",
" site8 | \n",
" time8 | \n",
" site9 | \n",
" time9 | \n",
" site10 | \n",
" time10 | \n",
" target | \n",
"
\n",
" \n",
" session_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 27554 | \n",
" 41475 | \n",
" 2013-11-15 07:39:35 | \n",
" 6725.0 | \n",
" 2013-11-15 07:39:35 | \n",
" 6725.0 | \n",
" 2013-11-15 07:39:36 | \n",
" 41475.0 | \n",
" 2013-11-15 07:39:36 | \n",
" 41476.0 | \n",
" 2013-11-15 07:39:40 | \n",
" ... | \n",
" 2013-11-15 07:39:41 | \n",
" 6725.0 | \n",
" 2013-11-15 07:42:50 | \n",
" 41475.0 | \n",
" 2013-11-15 07:42:50 | \n",
" 41476.0 | \n",
" 2013-11-15 07:42:50 | \n",
" 6725.0 | \n",
" 2013-11-15 07:44:25 | \n",
" 0 | \n",
"
\n",
" \n",
" 81350 | \n",
" 41476 | \n",
" 2013-11-15 07:44:25 | \n",
" 41475.0 | \n",
" 2013-11-15 07:44:25 | \n",
" 41476.0 | \n",
" 2013-11-15 07:57:45 | \n",
" 6725.0 | \n",
" 2013-11-15 07:57:45 | \n",
" 41475.0 | \n",
" 2013-11-15 07:57:45 | \n",
" ... | \n",
" 2013-11-15 07:57:46 | \n",
" 41476.0 | \n",
" 2013-11-15 07:57:47 | \n",
" 6725.0 | \n",
" 2013-11-15 07:57:49 | \n",
" 41475.0 | \n",
" 2013-11-15 07:57:49 | \n",
" 41476.0 | \n",
" 2013-11-15 07:57:49 | \n",
" 0 | \n",
"
\n",
" \n",
" 234665 | \n",
" 4802 | \n",
" 2013-11-15 07:52:17 | \n",
" 23.0 | \n",
" 2013-11-15 07:52:18 | \n",
" 4803.0 | \n",
" 2013-11-15 07:52:19 | \n",
" 38.0 | \n",
" 2013-11-15 07:52:19 | \n",
" 38.0 | \n",
" 2013-11-15 07:52:20 | \n",
" ... | \n",
" 2013-11-15 07:52:20 | \n",
" 4804.0 | \n",
" 2013-11-15 07:52:23 | \n",
" 21.0 | \n",
" 2013-11-15 07:52:26 | \n",
" 23.0 | \n",
" 2013-11-15 07:52:26 | \n",
" 22.0 | \n",
" 2013-11-15 07:52:28 | \n",
" 0 | \n",
"
\n",
" \n",
" 97610 | \n",
" 23 | \n",
" 2013-11-15 07:52:28 | \n",
" 23.0 | \n",
" 2013-11-15 07:52:29 | \n",
" 22.0 | \n",
" 2013-11-15 07:52:37 | \n",
" 21.0 | \n",
" 2013-11-15 07:52:37 | \n",
" 63.0 | \n",
" 2013-11-15 07:55:10 | \n",
" ... | \n",
" 2013-11-15 07:55:10 | \n",
" 784.0 | \n",
" 2013-11-15 07:55:56 | \n",
" 4804.0 | \n",
" 2013-11-15 07:57:50 | \n",
" 4804.0 | \n",
" 2013-11-15 08:01:18 | \n",
" 784.0 | \n",
" 2013-11-15 08:01:26 | \n",
" 0 | \n",
"
\n",
" \n",
" 161358 | \n",
" 41476 | \n",
" 2013-11-15 07:57:50 | \n",
" 41476.0 | \n",
" 2013-11-15 07:57:51 | \n",
" 6725.0 | \n",
" 2013-11-15 07:59:34 | \n",
" 41475.0 | \n",
" 2013-11-15 07:59:34 | \n",
" 41476.0 | \n",
" 2013-11-15 07:59:34 | \n",
" ... | \n",
" NaT | \n",
" NaN | \n",
" NaT | \n",
" NaN | \n",
" NaT | \n",
" NaN | \n",
" NaT | \n",
" NaN | \n",
" NaT | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 21 columns
\n",
"
"
],
"text/plain": [
" site1 time1 site2 time2 site3 \\\n",
"session_id \n",
"27554 41475 2013-11-15 07:39:35 6725.0 2013-11-15 07:39:35 6725.0 \n",
"81350 41476 2013-11-15 07:44:25 41475.0 2013-11-15 07:44:25 41476.0 \n",
"234665 4802 2013-11-15 07:52:17 23.0 2013-11-15 07:52:18 4803.0 \n",
"97610 23 2013-11-15 07:52:28 23.0 2013-11-15 07:52:29 22.0 \n",
"161358 41476 2013-11-15 07:57:50 41476.0 2013-11-15 07:57:51 6725.0 \n",
"\n",
" time3 site4 time4 site5 \\\n",
"session_id \n",
"27554 2013-11-15 07:39:36 41475.0 2013-11-15 07:39:36 41476.0 \n",
"81350 2013-11-15 07:57:45 6725.0 2013-11-15 07:57:45 41475.0 \n",
"234665 2013-11-15 07:52:19 38.0 2013-11-15 07:52:19 38.0 \n",
"97610 2013-11-15 07:52:37 21.0 2013-11-15 07:52:37 63.0 \n",
"161358 2013-11-15 07:59:34 41475.0 2013-11-15 07:59:34 41476.0 \n",
"\n",
" time5 ... time6 site7 \\\n",
"session_id ... \n",
"27554 2013-11-15 07:39:40 ... 2013-11-15 07:39:41 6725.0 \n",
"81350 2013-11-15 07:57:45 ... 2013-11-15 07:57:46 41476.0 \n",
"234665 2013-11-15 07:52:20 ... 2013-11-15 07:52:20 4804.0 \n",
"97610 2013-11-15 07:55:10 ... 2013-11-15 07:55:10 784.0 \n",
"161358 2013-11-15 07:59:34 ... NaT NaN \n",
"\n",
" time7 site8 time8 site9 \\\n",
"session_id \n",
"27554 2013-11-15 07:42:50 41475.0 2013-11-15 07:42:50 41476.0 \n",
"81350 2013-11-15 07:57:47 6725.0 2013-11-15 07:57:49 41475.0 \n",
"234665 2013-11-15 07:52:23 21.0 2013-11-15 07:52:26 23.0 \n",
"97610 2013-11-15 07:55:56 4804.0 2013-11-15 07:57:50 4804.0 \n",
"161358 NaT NaN NaT NaN \n",
"\n",
" time9 site10 time10 target \n",
"session_id \n",
"27554 2013-11-15 07:42:50 6725.0 2013-11-15 07:44:25 0 \n",
"81350 2013-11-15 07:57:49 41476.0 2013-11-15 07:57:49 0 \n",
"234665 2013-11-15 07:52:26 22.0 2013-11-15 07:52:28 0 \n",
"97610 2013-11-15 08:01:18 784.0 2013-11-15 08:01:26 0 \n",
"161358 NaT NaN NaT 0 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the training and test data sets and parse dates\n",
"train_df = pd.read_csv('train.csv',\n",
" index_col='session_id', parse_dates=times)\n",
"\n",
"test_df = pd.read_csv('test.csv',\n",
" index_col='session_id', parse_dates=['time1'])\n",
"\n",
"# Sort the data by time\n",
"train_df = train_df.sort_values(by='time1')\n",
"\n",
"# Look at the first rows of the training set\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.26759Z",
"iopub.status.busy": "2022-04-07T11:43:54.26702Z",
"iopub.status.idle": "2022-04-07T11:43:54.3888Z",
"shell.execute_reply": "2022-04-07T11:43:54.38759Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.267527Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 253561 entries, 27554 to 11690\n",
"Data columns (total 21 columns):\n",
"site1 253561 non-null int64\n",
"time1 253561 non-null datetime64[ns]\n",
"site2 250098 non-null float64\n",
"time2 250098 non-null datetime64[ns]\n",
"site3 246919 non-null float64\n",
"time3 246919 non-null datetime64[ns]\n",
"site4 244321 non-null float64\n",
"time4 244321 non-null datetime64[ns]\n",
"site5 241829 non-null float64\n",
"time5 241829 non-null datetime64[ns]\n",
"site6 239495 non-null float64\n",
"time6 239495 non-null datetime64[ns]\n",
"site7 237297 non-null float64\n",
"time7 237297 non-null datetime64[ns]\n",
"site8 235224 non-null float64\n",
"time8 235224 non-null datetime64[ns]\n",
"site9 233084 non-null float64\n",
"time9 233084 non-null datetime64[ns]\n",
"site10 231052 non-null float64\n",
"time10 231052 non-null datetime64[ns]\n",
"target 253561 non-null int64\n",
"dtypes: datetime64[ns](10), float64(9), int64(2)\n",
"memory usage: 42.6 MB\n"
]
}
],
"source": [
"train_df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "ae5799dd-3adc-45e8-a203-27459b4ac388",
"_uuid": "d22359e8d787e79accf47d05c089c52373c8fb4e"
},
"source": [
"The training data set contains the following features:\n",
"\n",
"- **site1** – ID of the first visited website in the session\n",
"- **time1** – visiting time for the first website in the session\n",
"- ...\n",
"- **site10** – ID of the tenth visited website in the session\n",
"- **time10** – visiting time for the tenth website in the session\n",
"- **target** – target variable, equals 1 for Alice's sessions, and 0 otherwise\n",
" \n",
"**User sessions end either if a user has visited ten websites or if a session has lasted over thirty minutes.**\n",
"\n",
"There are some empty values in the table, it means that some sessions contain less than ten websites. Replace empty values with 0 and change columns types to integer. Also load the websites dictionary and check how it looks:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"_cell_guid": "0f7b0c81-a791-4842-8b05-30f214507f2f",
"_uuid": "7fb01efc8ae51a6e919382054d0e39ce842cda01",
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.391089Z",
"iopub.status.busy": "2022-04-07T11:43:54.390736Z",
"iopub.status.idle": "2022-04-07T11:43:54.561505Z",
"shell.execute_reply": "2022-04-07T11:43:54.560478Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.391029Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Websites total: 48371\n"
]
}
],
"source": [
"# Change site1, ..., site10 columns type to integer and fill NA-values with zeros\n",
"sites = ['site'+str(i) for i in range(1, 11)]\n",
"train_df[sites] = train_df[sites].fillna(0).astype('int')\n",
"test_df[sites] = test_df[sites].fillna(0).astype('int')\n",
"\n",
"# Load websites dictionary\n",
"with open(r\"site_dic.pkl\", \"rb\") as input_file:\n",
" site_dict = pickle.load(input_file)\n",
" \n",
"# r before a string means \"raw\", i.e. take the string as it comes,\n",
"# e.g. as a file path without interpreting special symbols like \\n\n",
"\n",
"print('Websites total:', len(site_dict))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.565282Z",
"iopub.status.busy": "2022-04-07T11:43:54.565006Z",
"iopub.status.idle": "2022-04-07T11:43:54.60317Z",
"shell.execute_reply": "2022-04-07T11:43:54.602201Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.565237Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('www.abmecatronique.com', 25075),\n",
" ('groups.live.com', 13997),\n",
" ('majeureliguefootball.wordpress.com', 42436)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# See what's in the dict\n",
"list(site_dict.items())[:3]"
]
},
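{
"cell_type": "markdown",
"metadata": {},
"source": [
"As promised, a quick sanity check of the thirty-minute rule (a sketch on the training set): the gap between the first and the last timestamp of a session should not exceed 30 minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Session duration = last timestamp minus first; NaT values are skipped\n",
"durations = train_df[times].max(axis=1) - train_df['time1']\n",
"durations.max()"
]
},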
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"_cell_guid": "dcac415c-7145-4159-8ea6-ed30272c1a43",
"_uuid": "d839d194074efadac75da4dfb5e23e7fa09ac04f",
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.605092Z",
"iopub.status.busy": "2022-04-07T11:43:54.604571Z",
"iopub.status.idle": "2022-04-07T11:43:54.610954Z",
"shell.execute_reply": "2022-04-07T11:43:54.610035Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.605037Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(82797, 20) (253561, 21)\n"
]
}
],
"source": [
"# Size of the sets\n",
"print(test_df.shape, train_df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.612865Z",
"iopub.status.busy": "2022-04-07T11:43:54.612542Z",
"iopub.status.idle": "2022-04-07T11:43:54.675781Z",
"shell.execute_reply": "2022-04-07T11:43:54.674874Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.612807Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" site1 | \n",
" time1 | \n",
" site2 | \n",
" time2 | \n",
" site3 | \n",
" time3 | \n",
" site4 | \n",
" time4 | \n",
" site5 | \n",
" time5 | \n",
" ... | \n",
" time6 | \n",
" site7 | \n",
" time7 | \n",
" site8 | \n",
" time8 | \n",
" site9 | \n",
" time9 | \n",
" site10 | \n",
" time10 | \n",
" target | \n",
"
\n",
" \n",
" session_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 27554 | \n",
" 41475 | \n",
" 2013-11-15 07:39:35 | \n",
" 6725 | \n",
" 2013-11-15 07:39:35 | \n",
" 6725 | \n",
" 2013-11-15 07:39:36 | \n",
" 41475 | \n",
" 2013-11-15 07:39:36 | \n",
" 41476 | \n",
" 2013-11-15 07:39:40 | \n",
" ... | \n",
" 2013-11-15 07:39:41 | \n",
" 6725 | \n",
" 2013-11-15 07:42:50 | \n",
" 41475 | \n",
" 2013-11-15 07:42:50 | \n",
" 41476 | \n",
" 2013-11-15 07:42:50 | \n",
" 6725 | \n",
" 2013-11-15 07:44:25 | \n",
" 0 | \n",
"
\n",
" \n",
" 81350 | \n",
" 41476 | \n",
" 2013-11-15 07:44:25 | \n",
" 41475 | \n",
" 2013-11-15 07:44:25 | \n",
" 41476 | \n",
" 2013-11-15 07:57:45 | \n",
" 6725 | \n",
" 2013-11-15 07:57:45 | \n",
" 41475 | \n",
" 2013-11-15 07:57:45 | \n",
" ... | \n",
" 2013-11-15 07:57:46 | \n",
" 41476 | \n",
" 2013-11-15 07:57:47 | \n",
" 6725 | \n",
" 2013-11-15 07:57:49 | \n",
" 41475 | \n",
" 2013-11-15 07:57:49 | \n",
" 41476 | \n",
" 2013-11-15 07:57:49 | \n",
" 0 | \n",
"
\n",
" \n",
" 234665 | \n",
" 4802 | \n",
" 2013-11-15 07:52:17 | \n",
" 23 | \n",
" 2013-11-15 07:52:18 | \n",
" 4803 | \n",
" 2013-11-15 07:52:19 | \n",
" 38 | \n",
" 2013-11-15 07:52:19 | \n",
" 38 | \n",
" 2013-11-15 07:52:20 | \n",
" ... | \n",
" 2013-11-15 07:52:20 | \n",
" 4804 | \n",
" 2013-11-15 07:52:23 | \n",
" 21 | \n",
" 2013-11-15 07:52:26 | \n",
" 23 | \n",
" 2013-11-15 07:52:26 | \n",
" 22 | \n",
" 2013-11-15 07:52:28 | \n",
" 0 | \n",
"
\n",
" \n",
" 97610 | \n",
" 23 | \n",
" 2013-11-15 07:52:28 | \n",
" 23 | \n",
" 2013-11-15 07:52:29 | \n",
" 22 | \n",
" 2013-11-15 07:52:37 | \n",
" 21 | \n",
" 2013-11-15 07:52:37 | \n",
" 63 | \n",
" 2013-11-15 07:55:10 | \n",
" ... | \n",
" 2013-11-15 07:55:10 | \n",
" 784 | \n",
" 2013-11-15 07:55:56 | \n",
" 4804 | \n",
" 2013-11-15 07:57:50 | \n",
" 4804 | \n",
" 2013-11-15 08:01:18 | \n",
" 784 | \n",
" 2013-11-15 08:01:26 | \n",
" 0 | \n",
"
\n",
" \n",
" 161358 | \n",
" 41476 | \n",
" 2013-11-15 07:57:50 | \n",
" 41476 | \n",
" 2013-11-15 07:57:51 | \n",
" 6725 | \n",
" 2013-11-15 07:59:34 | \n",
" 41475 | \n",
" 2013-11-15 07:59:34 | \n",
" 41476 | \n",
" 2013-11-15 07:59:34 | \n",
" ... | \n",
" NaT | \n",
" 0 | \n",
" NaT | \n",
" 0 | \n",
" NaT | \n",
" 0 | \n",
" NaT | \n",
" 0 | \n",
" NaT | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 21 columns
\n",
"
"
],
"text/plain": [
" site1 time1 site2 time2 site3 \\\n",
"session_id \n",
"27554 41475 2013-11-15 07:39:35 6725 2013-11-15 07:39:35 6725 \n",
"81350 41476 2013-11-15 07:44:25 41475 2013-11-15 07:44:25 41476 \n",
"234665 4802 2013-11-15 07:52:17 23 2013-11-15 07:52:18 4803 \n",
"97610 23 2013-11-15 07:52:28 23 2013-11-15 07:52:29 22 \n",
"161358 41476 2013-11-15 07:57:50 41476 2013-11-15 07:57:51 6725 \n",
"\n",
" time3 site4 time4 site5 \\\n",
"session_id \n",
"27554 2013-11-15 07:39:36 41475 2013-11-15 07:39:36 41476 \n",
"81350 2013-11-15 07:57:45 6725 2013-11-15 07:57:45 41475 \n",
"234665 2013-11-15 07:52:19 38 2013-11-15 07:52:19 38 \n",
"97610 2013-11-15 07:52:37 21 2013-11-15 07:52:37 63 \n",
"161358 2013-11-15 07:59:34 41475 2013-11-15 07:59:34 41476 \n",
"\n",
" time5 ... time6 site7 \\\n",
"session_id ... \n",
"27554 2013-11-15 07:39:40 ... 2013-11-15 07:39:41 6725 \n",
"81350 2013-11-15 07:57:45 ... 2013-11-15 07:57:46 41476 \n",
"234665 2013-11-15 07:52:20 ... 2013-11-15 07:52:20 4804 \n",
"97610 2013-11-15 07:55:10 ... 2013-11-15 07:55:10 784 \n",
"161358 2013-11-15 07:59:34 ... NaT 0 \n",
"\n",
" time7 site8 time8 site9 \\\n",
"session_id \n",
"27554 2013-11-15 07:42:50 41475 2013-11-15 07:42:50 41476 \n",
"81350 2013-11-15 07:57:47 6725 2013-11-15 07:57:49 41475 \n",
"234665 2013-11-15 07:52:23 21 2013-11-15 07:52:26 23 \n",
"97610 2013-11-15 07:55:56 4804 2013-11-15 07:57:50 4804 \n",
"161358 NaT 0 NaT 0 \n",
"\n",
" time9 site10 time10 target \n",
"session_id \n",
"27554 2013-11-15 07:42:50 6725 2013-11-15 07:44:25 0 \n",
"81350 2013-11-15 07:57:49 41476 2013-11-15 07:57:49 0 \n",
"234665 2013-11-15 07:52:26 22 2013-11-15 07:52:28 0 \n",
"97610 2013-11-15 08:01:18 784 2013-11-15 08:01:26 0 \n",
"161358 NaT 0 NaT 0 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# What's inside the train\n",
"train_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "2bccda21-944c-4808-83a1-9ca3e1580220",
"_uuid": "d3c63c91f13663e02c486cb0b545ccadc2dec318"
},
"source": [
"For the very basic model, we will use only the visited websites in the session (we will not take into account timestamp features). \n",
"\n",
"*Alice has her favorite sites, and the more often you see these sites in the session, the higher probability that this is an Alice session, and vice versa.*\n",
"\n",
"Let us prepare the data, we will take only features `site1, site2, ... , site10` from the whole dataframe. Keep in mind that the missing values are replaced with zero. Here is what the first rows of the dataframe look like:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.684767Z",
"iopub.status.busy": "2022-04-07T11:43:54.684447Z",
"iopub.status.idle": "2022-04-07T11:43:54.757709Z",
"shell.execute_reply": "2022-04-07T11:43:54.756999Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.68471Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" site1 | \n",
" site2 | \n",
" site3 | \n",
" site4 | \n",
" site5 | \n",
" site6 | \n",
" site7 | \n",
" site8 | \n",
" site9 | \n",
" site10 | \n",
"
\n",
" \n",
" session_id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 27554 | \n",
" 41475 | \n",
" 6725 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 6725 | \n",
"
\n",
" \n",
" 81350 | \n",
" 41476 | \n",
" 41475 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
"
\n",
" \n",
" 234665 | \n",
" 4802 | \n",
" 23 | \n",
" 4803 | \n",
" 38 | \n",
" 38 | \n",
" 4804 | \n",
" 4804 | \n",
" 21 | \n",
" 23 | \n",
" 22 | \n",
"
\n",
" \n",
" 97610 | \n",
" 23 | \n",
" 23 | \n",
" 22 | \n",
" 21 | \n",
" 63 | \n",
" 66 | \n",
" 784 | \n",
" 4804 | \n",
" 4804 | \n",
" 784 | \n",
"
\n",
" \n",
" 161358 | \n",
" 41476 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" site1 site2 site3 site4 site5 site6 site7 site8 site9 \\\n",
"session_id \n",
"27554 41475 6725 6725 41475 41476 41476 6725 41475 41476 \n",
"81350 41476 41475 41476 6725 41475 41476 41476 6725 41475 \n",
"234665 4802 23 4803 38 38 4804 4804 21 23 \n",
"97610 23 23 22 21 63 66 784 4804 4804 \n",
"161358 41476 41476 6725 41475 41476 0 0 0 0 \n",
"\n",
" site10 \n",
"session_id \n",
"27554 6725 \n",
"81350 41476 \n",
"234665 22 \n",
"97610 784 \n",
"161358 0 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df[sites].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since IDs of sites have no meaning (does not matter if a site has an ID of 1 or 100), we need to think about how to encode the meaning of \"this site in a session means higher probablity that it is an Alice session\". \n",
"\n",
"We will use a technique called [\"bag of words plus n-gram model\"](https://en.wikipedia.org/wiki/Bag-of-words_model).\n",
"\n",
"We will make a \"site-session\" matrix analogous to the term-document matrix.\n",
"\n",
"We are not the first, and luckily there is a function CountVectorizer that will implement the above model. Type help(CountVectorizer) to learn about the function. "
]
},
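{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is a minimal bag-of-words sketch on two made-up \"documents\" (the site names are invented): each row of the resulting matrix counts how often each token occurs in a document."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal bag-of-words sketch on made-up documents\n",
"toy_docs = ['google vk google', 'vk mail mail']\n",
"toy_cv = CountVectorizer()\n",
"print(toy_cv.fit_transform(toy_docs).toarray())  # [[2 0 1] [0 2 1]]\n",
"print(toy_cv.get_feature_names())  # ['google', 'mail', 'vk']"
]
},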
{
"cell_type": "markdown",
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:54:59.202729Z",
"iopub.status.busy": "2022-04-07T11:54:59.202402Z",
"iopub.status.idle": "2022-04-07T11:54:59.207219Z",
"shell.execute_reply": "2022-04-07T11:54:59.206225Z",
"shell.execute_reply.started": "2022-04-07T11:54:59.20268Z"
}
},
"source": [
"We will now initialize a \"cv\" (CountVectorizer's) instance which we need to train. \n",
"\n",
"We will use the following parameters:\n",
"\n",
"_ngram range=(1, 3)_ - here we decide that we will use \n",
"1) the name of the site, \n",
"2) two consecutive site names, and \n",
"3) three consecutive site names as features. \n",
"E.g. \"google.com\" or \"google.com vk.com\" or \"google.com vk.com groups.live.com\". \n",
"\n",
"CountVectorizer will create a large dictionary of 1, 2, and 3-gram strings of sites represented by their numerical IDs. However, this dictionary will be so so large that we may run into trouble with memory or we will just be inefficent chasing phantom combinations.\n",
"\n",
"We will thus limit the dictionary to 50K of the most frequent n-grams:\n",
"\n",
"_max features=50000_\n",
"\n",
"Here is our empty instance:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)"
]
},
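{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see what `ngram_range=(1, 3)` produces, here is the vocabulary extracted from one made-up three-site session (the site names are invented):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# n-gram sketch: one made-up session of three sites\n",
"toy_ngram_cv = CountVectorizer(ngram_range=(1, 3))\n",
"toy_ngram_cv.fit(['google vk mail'])\n",
"toy_ngram_cv.get_feature_names()\n",
"# ['google', 'google vk', 'google vk mail', 'mail', 'vk', 'vk mail']"
]
},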
{
"cell_type": "markdown",
"metadata": {},
"source": [
"CountVectorizer accepts \"document strings\", so let's prepare a string of our \"documents\" (i.e. sites), divided by space. Since the string will be huge, we will write this string in a text file using pandas:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"_cell_guid": "9e35cd8d-5cdc-4eef-a7d6-1e6fc12edcfd",
"_uuid": "1249386daf13151565cd4a33cdfa3ca0bcb59505",
"execution": {
"iopub.execute_input": "2022-04-07T11:51:51.351437Z",
"iopub.status.busy": "2022-04-07T11:51:51.350353Z",
"iopub.status.idle": "2022-04-07T11:51:57.87576Z",
"shell.execute_reply": "2022-04-07T11:51:57.874891Z",
"shell.execute_reply.started": "2022-04-07T11:51:51.351376Z"
}
},
"outputs": [],
"source": [
"train_df[sites].fillna(0).to_csv('train_sessions_text.txt', \n",
" sep=' ', index=None, header=None)\n",
"test_df[sites].fillna(0).to_csv('test_sessions_text.txt', \n",
" sep=' ', index=None, header=None)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"execution": {
"iopub.execute_input": "2022-04-07T11:44:43.700127Z",
"iopub.status.busy": "2022-04-07T11:44:43.699539Z",
"iopub.status.idle": "2022-04-07T11:44:43.750425Z",
"shell.execute_reply": "2022-04-07T11:44:43.749546Z",
"shell.execute_reply.started": "2022-04-07T11:44:43.700057Z"
},
"jupyter": {
"outputs_hidden": true
}
},
"source": [
"Before we start using CountVectorizer, let's see how it works on a sub-set of 5 sessions:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:52:42.67481Z",
"iopub.status.busy": "2022-04-07T11:52:42.674326Z",
"iopub.status.idle": "2022-04-07T11:52:42.690575Z",
"shell.execute_reply": "2022-04-07T11:52:42.689795Z",
"shell.execute_reply.started": "2022-04-07T11:52:42.674754Z"
}
},
"outputs": [],
"source": [
"five_sess = pd.read_csv('train_sessions_text.txt', sep=' ', nrows=5, header=None)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:52:45.022484Z",
"iopub.status.busy": "2022-04-07T11:52:45.022015Z",
"iopub.status.idle": "2022-04-07T11:52:45.046309Z",
"shell.execute_reply": "2022-04-07T11:52:45.045522Z",
"shell.execute_reply.started": "2022-04-07T11:52:45.022442Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 41475 | \n",
" 6725 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 6725 | \n",
"
\n",
" \n",
" 1 | \n",
" 41476 | \n",
" 41475 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
"
\n",
" \n",
" 2 | \n",
" 4802 | \n",
" 23 | \n",
" 4803 | \n",
" 38 | \n",
" 38 | \n",
" 4804 | \n",
" 4804 | \n",
" 21 | \n",
" 23 | \n",
" 22 | \n",
"
\n",
" \n",
" 3 | \n",
" 23 | \n",
" 23 | \n",
" 22 | \n",
" 21 | \n",
" 63 | \n",
" 66 | \n",
" 784 | \n",
" 4804 | \n",
" 4804 | \n",
" 784 | \n",
"
\n",
" \n",
" 4 | \n",
" 41476 | \n",
" 41476 | \n",
" 6725 | \n",
" 41475 | \n",
" 41476 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9\n",
"0 41475 6725 6725 41475 41476 41476 6725 41475 41476 6725\n",
"1 41476 41475 41476 6725 41475 41476 41476 6725 41475 41476\n",
"2 4802 23 4803 38 38 4804 4804 21 23 22\n",
"3 23 23 22 21 63 66 784 4804 4804 784\n",
"4 41476 41476 6725 41475 41476 0 0 0 0 0"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"five_sess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First of all, let's make an inverse dictionary which gives us a site name for ID.\n",
"The direct dictionary came to us like this:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:53:45.030222Z",
"iopub.status.busy": "2022-04-07T11:53:45.029856Z",
"iopub.status.idle": "2022-04-07T11:53:45.036002Z",
"shell.execute_reply": "2022-04-07T11:53:45.034005Z",
"shell.execute_reply.started": "2022-04-07T11:53:45.030169Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('www.abmecatronique.com', 25075),\n",
" ('groups.live.com', 13997),\n",
" ('majeureliguefootball.wordpress.com', 42436)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(site_dict.items())[:3]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:53:42.095623Z",
"iopub.status.busy": "2022-04-07T11:53:42.095178Z",
"iopub.status.idle": "2022-04-07T11:53:42.126674Z",
"shell.execute_reply": "2022-04-07T11:53:42.125463Z",
"shell.execute_reply.started": "2022-04-07T11:53:42.095576Z"
}
},
"outputs": [],
"source": [
"# The inverse dictionary:\n",
"\n",
"new_dict = {}\n",
"for key in site_dict:\n",
" new_dict[site_dict[key]] = key"
]
},
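{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same inversion can be written as a one-line dict comprehension:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Equivalent one-liner (site IDs are unique, so no keys collide)\n",
"assert new_dict == {site_id: site for site, site_id in site_dict.items()}"
]
},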
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:53:48.926092Z",
"iopub.status.busy": "2022-04-07T11:53:48.925589Z",
"iopub.status.idle": "2022-04-07T11:53:48.932805Z",
"shell.execute_reply": "2022-04-07T11:53:48.932039Z",
"shell.execute_reply.started": "2022-04-07T11:53:48.926043Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[(25075, 'www.abmecatronique.com'),\n",
" (13997, 'groups.live.com'),\n",
" (42436, 'majeureliguefootball.wordpress.com')]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's check what's in it:\n",
"\n",
"list(new_dict.items())[:3]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:54:04.095135Z",
"iopub.status.busy": "2022-04-07T11:54:04.094408Z",
"iopub.status.idle": "2022-04-07T11:54:04.102594Z",
"shell.execute_reply": "2022-04-07T11:54:04.101902Z",
"shell.execute_reply.started": "2022-04-07T11:54:04.095075Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"41475 6725 6725 41475 41476 41476 6725 41475 41476 6725\n",
"41476 41475 41476 6725 41475 41476 41476 6725 41475 41476\n",
"4802 23 4803 38 38 4804 4804 21 23 22\n",
"23 23 22 21 63 66 784 4804 4804 784\n",
"41476 41476 6725 41475 41476\n",
"\n",
"security.debian.org www-fourier.ujf-grenoble.fr www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org backports.debian.org www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org www-fourier.ujf-grenoble.fr\n",
"backports.debian.org security.debian.org backports.debian.org www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org backports.debian.org www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org\n",
"cnfg.toolbarservices.com www.google.com utils.delta-search.com ajax.googleapis.com ajax.googleapis.com img.babylon.com img.babylon.com www.google.fr www.google.com apis.google.com\n",
"www.google.com www.google.com apis.google.com www.google.fr ieonline.microsoft.com go.microsoft.com javadl-esd-secure.oracle.com img.babylon.com img.babylon.com javadl-esd-secure.oracle.com\n",
"backports.debian.org backports.debian.org www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org\n"
]
}
],
"source": [
"# Let's see site names in the five first sessions:\n",
"\n",
"list_sites = []\n",
"for row in five_sess.values:\n",
" row_sites = ' '.join([str(i) for i in row if i!=0])\n",
" print(row_sites)\n",
" list_sites.append(row_sites) \n",
"\n",
"print()\n",
" \n",
"list_sites_names = []\n",
"for row in five_sess.values:\n",
" row_sites = ' '.join([new_dict[i] for i in row if i!=0])\n",
" print(row_sites)\n",
" list_sites_names.append(row_sites)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is what the fit and transform method -- i.e. learn the dictionary and make the matrix -- produces in our \"cv\":\n",
"a sparse matrix. Why sparse? Because nrows * dict_size = usually will not fit in memory \n",
"(obviously, our 5 sessions will fit in memory so that we can look at them)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:55:04.927195Z",
"iopub.status.busy": "2022-04-07T11:55:04.926495Z",
"iopub.status.idle": "2022-04-07T11:55:04.945337Z",
"shell.execute_reply": "2022-04-07T11:55:04.944114Z",
"shell.execute_reply.started": "2022-04-07T11:55:04.927126Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"<5x60 sparse matrix of type ''\n",
"\twith 88 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"see_vect = cv.fit_transform(list_sites)\n",
"\n",
"# Matrix dimensions: 5 sessions of 60 elements\n",
"see_vect"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:56:11.894299Z",
"iopub.status.busy": "2022-04-07T11:56:11.893888Z",
"iopub.status.idle": "2022-04-07T11:56:11.901546Z",
"shell.execute_reply": "2022-04-07T11:56:11.900821Z",
"shell.execute_reply.started": "2022-04-07T11:56:11.894234Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"['21', '21 23', '21 23 22', '21 63', '21 63 66', '22']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Here is the dictionary of sites, 1 to 3-gram words. First 6 elements in the matrix:\n",
"\n",
"cv.get_feature_names()[:6]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 www.google.fr \n",
"2 www.google.fr www.google.com \n",
"3 www.google.fr www.google.com apis.google.com \n",
"4 www.google.fr ieonline.microsoft.com \n",
"5 www.google.fr ieonline.microsoft.com go.microsoft.com \n",
"6 apis.google.com \n",
"7 apis.google.com www.google.fr \n",
"8 apis.google.com www.google.fr ieonline.microsoft.com \n",
"9 www.google.com \n",
"10 www.google.com apis.google.com \n",
"11 www.google.com apis.google.com www.google.fr \n",
"12 www.google.com www.google.com \n",
"13 www.google.com www.google.com apis.google.com \n",
"14 www.google.com utils.delta-search.com \n",
"15 www.google.com utils.delta-search.com ajax.googleapis.com \n",
"16 ajax.googleapis.com \n",
"17 ajax.googleapis.com ajax.googleapis.com \n",
"18 ajax.googleapis.com ajax.googleapis.com img.babylon.com \n",
"19 ajax.googleapis.com img.babylon.com \n",
"20 ajax.googleapis.com img.babylon.com img.babylon.com \n",
"21 security.debian.org \n"
]
}
],
"source": [
"# A version with the site names. Note that security.debian.org has ID of 21.\n",
"\n",
"for i, string in enumerate(cv.get_feature_names()):\n",
" if i < 21:\n",
" print (i+1, end=\" \")\n",
" for num in string.split():\n",
" print(new_dict[int(num)], end=\" \")\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:56:36.148451Z",
"iopub.status.busy": "2022-04-07T11:56:36.147711Z",
"iopub.status.idle": "2022-04-07T11:56:36.155891Z",
"shell.execute_reply": "2022-04-07T11:56:36.155119Z",
"shell.execute_reply.started": "2022-04-07T11:56:36.148398Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2,\n",
" 1, 1, 1, 1, 3, 0, 0, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 1, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3,\n",
" 1, 1, 0, 0, 5, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0],\n",
" [1, 1, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [1, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,\n",
" 0, 0, 0, 0, 3, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0]], dtype=int64)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Here is the session-site matrix, toarrray() helps us to see a sparse matrix since it is not large.\n",
"\n",
"see_vect.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'security.debian.org www-fourier.ujf-grenoble.fr www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org backports.debian.org www-fourier.ujf-grenoble.fr security.debian.org backports.debian.org www-fourier.ujf-grenoble.fr'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The first session (row in the matrix) is this:\n",
"\n",
"list_sites_names[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see how the first site of the first session, \"security.debian.org\", is recorded in the session-site matrix. \n",
"Its ID is 21 which corresponds to 3. It is the number of times this site was seen in the first session.\n",
"Indeed, count for yourself in the cell above. "
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"5 0\n",
"6 0\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 0\n",
"14 0\n",
"15 0\n",
"16 0\n",
"17 0\n",
"18 0\n",
"19 0\n",
"20 0\n",
"21 3\n"
]
}
],
"source": [
"\n",
"first_row = see_vect.toarray()[0]\n",
"\n",
"for one, two in zip(range(60),first_row):\n",
" if one < 21:\n",
" print (one+1, two)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's go back to all sessions."
]
},
{
"cell_type": "markdown",
"metadata": {
"_uuid": "df2d4246d8542eae60adbbf3e2d599c80d5044a4"
},
"source": [
"Fit `CountVectorizer` to train data and transform the train and test data with it."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T11:59:16.803945Z",
"iopub.status.busy": "2022-04-07T11:59:16.802761Z",
"iopub.status.idle": "2022-04-07T11:59:37.310285Z",
"shell.execute_reply": "2022-04-07T11:59:37.309436Z",
"shell.execute_reply.started": "2022-04-07T11:59:16.803848Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(253561, 50000) (82797, 50000)\n",
"Wall time: 21.2 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"with open('train_sessions_text.txt') as inp_train_file:\n",
" X_train = cv.fit_transform(inp_train_file)\n",
"with open('test_sessions_text.txt') as inp_test_file:\n",
" X_test = cv.transform(inp_test_file)\n",
"\n",
"print(X_train.shape, X_test.shape)\n",
"\n",
"# Note very big dimensions of matrices: 253561 * 50000 = 12678050000 elements in train! Only sparse matrices can take it."
]
},
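{
"cell_type": "markdown",
"metadata": {},
"source": [
"A rough back-of-the-envelope check of why sparsity matters here: a dense int64 matrix of this shape would need roughly 94 GiB, while the sparse matrix stores only the nonzero counts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Dense size: rows * columns * 8 bytes per int64 cell\n",
"dense_gib = X_train.shape[0] * X_train.shape[1] * 8 / 2**30\n",
"sparse_mib = (X_train.data.nbytes + X_train.indices.nbytes + X_train.indptr.nbytes) / 2**20\n",
"print('dense: %.1f GiB, sparse: %.1f MiB' % (dense_gib, sparse_mib))"
]
},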
{
"cell_type": "markdown",
"metadata": {
"_cell_guid": "f5424973-a5b0-40be-add7-bebb788adcfa",
"_uuid": "e6d8be7bcbbf3aea078cada2195f43b689c181af"
},
"source": [
"### Training the first model\n",
"\n",
"So, we have an algorithm and data for it. Let us build our first model, using [logistic regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) implementation from ` Sklearn` with default parameters. We will use the first 90% of the data for training (the training data set is sorted by time) and the remaining 10% for validation. Let's write a simple function that returns the quality of the model and then train our first classifier:"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"_cell_guid": "bbcea30d-1e1e-4f14-bbd7-49930d165d7d",
"_uuid": "0e5a5f750b7406290c52a00614e0ecf8a44e00bf",
"execution": {
"iopub.execute_input": "2022-04-07T12:06:54.137479Z",
"iopub.status.busy": "2022-04-07T12:06:54.137046Z",
"iopub.status.idle": "2022-04-07T12:06:54.144817Z",
"shell.execute_reply": "2022-04-07T12:06:54.143131Z",
"shell.execute_reply.started": "2022-04-07T12:06:54.13741Z"
}
},
"outputs": [],
"source": [
"def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):\n",
" \n",
" # Split the data into the training and validation sets\n",
" idx = int(round(X.shape[0] * ratio))\n",
" \n",
" # Classifier training\n",
" lr = LogisticRegression(C=C, random_state=seed, solver='lbfgs', max_iter=500).fit(X[:idx, :], y[:idx])\n",
" \n",
" # Prediction for validation set\n",
" y_pred = lr.predict_proba(X[idx:, :])[:, 1]\n",
" \n",
" # Calculate the quality\n",
" score = roc_auc_score(y[idx:], y_pred)\n",
" \n",
" return score"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"_cell_guid": "9f3b378b-a11a-44c9-a7d1-0da152a42aae",
"_uuid": "8f7b6e88162f02f298cd2edab0aa336d8f377607",
"execution": {
"iopub.execute_input": "2022-04-07T11:43:54.677941Z",
"iopub.status.busy": "2022-04-07T11:43:54.677361Z",
"iopub.status.idle": "2022-04-07T11:43:54.683147Z",
"shell.execute_reply": "2022-04-07T11:43:54.682077Z",
"shell.execute_reply.started": "2022-04-07T11:43:54.67774Z"
}
},
"outputs": [],
"source": [
"# Our target variable\n",
"y_train = train_df['target'].values"
]
},
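{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, here is where the default 90/10 time-based split falls (the same index arithmetic as in the function above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx = int(round(X_train.shape[0] * 0.9))\n",
"print(idx, 'training rows,', X_train.shape[0] - idx, 'validation rows')"
]
},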
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"_cell_guid": "c966e06b-024f-425d-8a17-347bace41214",
"_uuid": "6b0670f406f20a741823e55f110738373af7eb51",
"execution": {
"iopub.execute_input": "2022-04-07T12:07:22.102357Z",
"iopub.status.busy": "2022-04-07T12:07:22.101573Z",
"iopub.status.idle": "2022-04-07T12:07:43.138674Z",
"shell.execute_reply": "2022-04-07T12:07:43.137487Z",
"shell.execute_reply.started": "2022-04-07T12:07:22.102302Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9122581928805027\n",
"Wall time: 21 s\n"
]
}
],
"source": [
"%%time\n",
"# Calculate metric on the validation set. 90% of train data for training. 10% for validation.\n",
"\n",
"print(get_auc_lr_valid(X_train, y_train))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"execution": {
"iopub.execute_input": "2022-04-07T12:08:42.560724Z",
"iopub.status.busy": "2022-04-07T12:08:42.560054Z",
"iopub.status.idle": "2022-04-07T12:08:56.956133Z",
"shell.execute_reply": "2022-04-07T12:08:56.955086Z",
"shell.execute_reply.started": "2022-04-07T12:08:42.560645Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.8225182301089249"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 50% of train data for training:\n",
"\n",
"get_auc_lr_valid(X_train, y_train, ratio=0.5)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"execution": {
"iopub.execute_input": "2022-03-31T10:22:18.267007Z",
"iopub.status.busy": "2022-03-31T10:22:18.26632Z",
"iopub.status.idle": "2022-03-31T10:22:18.271725Z",
"shell.execute_reply": "2022-03-31T10:22:18.270772Z",
"shell.execute_reply.started": "2022-03-31T10:22:18.266941Z"
}
},
"outputs": [],
"source": [
"# Wow! Big data rules in this task: .82 -> .91"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"_cell_guid": "4e7590d4-096f-4282-b0c6-702b5b5e8917",
"_uuid": "d6457c4112f39c8d5721fba91feb1a4e74d3ed98",
"execution": {
"iopub.execute_input": "2022-04-07T12:12:08.196316Z",
"iopub.status.busy": "2022-04-07T12:12:08.195957Z",
"iopub.status.idle": "2022-04-07T12:12:08.201861Z",
"shell.execute_reply": "2022-04-07T12:12:08.200826Z",
"shell.execute_reply.started": "2022-04-07T12:12:08.196265Z"
}
},
"outputs": [],
"source": [
"# Function for writing predictions to a file\n",
"def write_to_submission_file(predicted_labels, out_file,\n",
" target='target', index_label=\"session_id\"):\n",
" predicted_df = pd.DataFrame(predicted_labels,\n",
" index = range(1, predicted_labels.shape[0] + 1),\n",
" columns=[target])\n",
" predicted_df.to_csv(out_file, index_label=index_label)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"_cell_guid": "43108119-ddee-4eaf-830b-91adec26009b",
"_uuid": "2c055eb78026fc951216ed7dbdbedcfa47b61898",
"execution": {
"iopub.execute_input": "2022-04-07T12:12:42.559522Z",
"iopub.status.busy": "2022-04-07T12:12:42.559052Z",
"iopub.status.idle": "2022-04-07T12:13:07.978637Z",
"shell.execute_reply": "2022-04-07T12:13:07.977572Z",
"shell.execute_reply.started": "2022-04-07T12:12:42.55948Z"
}
},
"outputs": [],
"source": [
"# Train the model on the whole training data set\n",
"# Use random_state=17 for reproducibility\n",
"# Parameter C=1 by default, but here we set it explicitly\n",
"\n",
"lr = LogisticRegression(C=1.0, random_state=17, solver='lbfgs', max_iter=500).fit(X_train, y_train)\n",
"\n",
"# Make a prediction for test data set\n",
"y_test = lr.predict_proba(X_test)[:, 1]\n",
"\n",
"# Write it to the file which could be submitted\n",
"write_to_submission_file(y_test, 'baseline_1.csv')"
]
},
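{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the submission file: it should contain one probability per test session, all within [0, 1]."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"submission = pd.read_csv('baseline_1.csv', index_col='session_id')\n",
"print(submission.shape)  # expected: (82797, 1)\n",
"print(submission['target'].between(0, 1).all())  # expected: True"
]
},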
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The first model demonstrated the quality of 0.9235 on the validation set. Let's take it as the first baseline and starting point. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}