In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import random

np.random.seed(1984)

In [4]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

In [5]:
data_train.head()

Unnamed: 0,ISU,ST_YEAR,SEMESTER,DISC_ID,TYPE_NAME,DEBT
0,17623928044460629977,2020,1,10502311854018326223,Зачет,0
1,17623928044460629977,2020,1,1601392918367593206,Зачет,0
2,17623928044460629977,2020,1,9559803959325174929,Зачет,0
3,17623928044460629977,2020,1,8955667882044263414,Зачет,0
4,17623928044460629977,2020,1,17741967398854095262,Экзамен,0


In [6]:
all_st_df = []

for st_year in data_train['ST_YEAR'].unique():
    for semester in data_train['SEMESTER'].unique():
        data_temp = (
            data_train
            .drop('DISC_ID', axis=1)
            [(data_train['ST_YEAR'] < st_year) & (data_train['SEMESTER'] < semester)]
            .groupby(['ISU', 'TYPE_NAME'], as_index=False)
            .agg(DEBT_MEAN=('DEBT', 'mean'), DEBT_SUM=('DEBT', 'sum'), DEBT_COUNT=('DEBT', 'count')
            )
        )
        data_temp['ST_YEAR'] = st_year
        data_temp['SEMESTER'] = semester
        
        all_st_df.append(data_temp)

In [7]:
all_st_df = pd.concat(all_st_df)

In [8]:
all_disc_df = []

for st_year in data_train['ST_YEAR'].unique():
    for semester in data_train['SEMESTER'].unique():
        data_temp = (
            data_train
            .drop('ISU', axis=1)
            [(data_train['ST_YEAR'] < st_year) & (data_train['SEMESTER'] < semester)]
            .groupby(['DISC_ID', 'TYPE_NAME'], as_index=False)
            .agg(DISC_DEBT_MEAN=('DEBT', 'mean'), DISC_DEBT_SUM=('DEBT', 'sum'), DISC_DEBT_COUNT=('DEBT', 'count')
            )
        )
        data_temp['ST_YEAR'] = st_year
        data_temp['SEMESTER'] = semester
        
        all_disc_df.append(data_temp)

In [9]:
all_disc_df = pd.concat(all_disc_df)

In [10]:
data_train = data_train.merge(all_st_df, on=['ISU', 'ST_YEAR', 'SEMESTER', 'TYPE_NAME'], how='left')

In [11]:
data_train = data_train.merge(all_disc_df, on=['DISC_ID', 'ST_YEAR', 'SEMESTER', 'TYPE_NAME'], how='left')

In [12]:
data_train = data_train.fillna(0)

In [13]:
data_train['ST_YEAR'].value_counts()

2020    87558
2019    60631
2018    30710
Name: ST_YEAR, dtype: int64

In [14]:
data_train = data_train.join(pd.get_dummies(data_train['TYPE_NAME'], prefix='TYPE_NAME'))

In [15]:
cols_to_drop = ['ISU', 'DISC_ID', 'TYPE_NAME']

In [16]:
train = data_train[data_train['ST_YEAR'].isin([2018, 2019])].drop(cols_to_drop, axis=1)
test = data_train[data_train['ST_YEAR'] == 2020].drop(cols_to_drop, axis=1)

In [17]:
X_train = train.drop(['DEBT'], axis=1)
y_train = train['DEBT']

X_test = test.drop(['DEBT'], axis=1)
y_test = test['DEBT']

In [18]:
model = LogisticRegression(class_weight='balanced')

In [19]:
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [20]:
print('f1 score', f1_score(y_test, preds))
print('accuracy score', accuracy_score(y_test, preds))
print('precision score', precision_score(y_test, preds))
print('recall score', recall_score(y_test, preds))

f1 score 0.12633139343362249
accuracy score 0.6365152241942483
precision score 0.07035191243464702
recall score 0.6183821553345875


In [21]:
# fit on all data
X = data_train.drop(cols_to_drop + ['DEBT'], axis=1)
y = data_train['DEBT']

model.fit(X, y);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# collect data for test

In [22]:
all_st_df_test = []

for st_year in data_train['ST_YEAR'].unique():
    for semester in data_train['SEMESTER'].unique():
        data_temp = (
            data_train
            .drop('DISC_ID', axis=1)
            [(data_train['ST_YEAR'] <= st_year) & (data_train['SEMESTER'] <= semester)]
            .groupby(['ISU', 'TYPE_NAME'], as_index=False)
            .agg(DEBT_MEAN=('DEBT', 'mean'), DEBT_SUM=('DEBT', 'sum'), DEBT_COUNT=('DEBT', 'count')
            )
        )
        data_temp['ST_YEAR'] = st_year + 1
        data_temp['SEMESTER'] = semester + 1
        
        all_st_df_test.append(data_temp)
        
all_disc_df_test = []

for st_year in data_train['ST_YEAR'].unique():
    for semester in data_train['SEMESTER'].unique():
        data_temp = (
            data_train
            .drop('ISU', axis=1)
            [(data_train['ST_YEAR'] <= st_year) & (data_train['SEMESTER'] <= semester)]
            .groupby(['DISC_ID', 'TYPE_NAME'], as_index=False)
            .agg(DISC_DEBT_MEAN=('DEBT', 'mean'), DISC_DEBT_SUM=('DEBT', 'sum'), DISC_DEBT_COUNT=('DEBT', 'count')
            )
        )
        data_temp['ST_YEAR'] = st_year + 1
        data_temp['SEMESTER'] = semester + 1
        
        all_disc_df_test.append(data_temp)
        
        
all_st_df_test = pd.concat(all_st_df_test)
all_disc_df_test = pd.concat(all_disc_df_test)

In [23]:
data_test = data_test.merge(all_st_df_test, on=['ISU', 'SEMESTER', 'ST_YEAR', 'TYPE_NAME'], how='left')
data_test = data_test.merge(all_disc_df_test, on=['DISC_ID', 'SEMESTER', 'ST_YEAR', 'TYPE_NAME'], how='left')

In [24]:
data_test = data_test.fillna(0)

In [25]:
data_test = data_test.join(pd.get_dummies(data_test['TYPE_NAME'], prefix='TYPE_NAME'))

In [26]:
cols_to_drop_test = ['ISU', 'DISC_ID', 'TYPE_NAME']

In [27]:
test_features = data_test.drop(cols_to_drop_test, axis=1)

In [28]:
final_preds = model.predict(test_features)

In [29]:
data_test['DEBT'] = final_preds

In [30]:
data_test['ID'] = data_test[['ISU', 'ST_YEAR', 'DISC_ID', 'SEMESTER', 'TYPE_NAME']].apply(
        lambda x: f'ISU:{x[0]} | ST_YEAR:{x[1]} | DISC_ID:{x[2]} | SEMESTER:{x[3]} | TYPE_NAME:{x[4]}', axis =1)

In [31]:
res = data_test[['ID', 'DEBT']]

In [33]:
res.to_csv('baseline_preds.csv', index=False)

In [34]:
res

Unnamed: 0,ID,DEBT
0,ISU:22160896830459167 | ST_YEAR:2021 | DISC_ID...,1
1,ISU:22160896830459167 | ST_YEAR:2021 | DISC_ID...,0
2,ISU:22160896830459167 | ST_YEAR:2021 | DISC_ID...,0
3,ISU:22160896830459167 | ST_YEAR:2021 | DISC_ID...,0
4,ISU:22160896830459167 | ST_YEAR:2021 | DISC_ID...,0
...,...,...
30908,ISU:18441846113897984349 | ST_YEAR:2021 | DISC...,0
30909,ISU:18441846113897984349 | ST_YEAR:2021 | DISC...,0
30910,ISU:18441846113897984349 | ST_YEAR:2021 | DISC...,0
30911,ISU:18441846113897984349 | ST_YEAR:2021 | DISC...,0
