import pandas as pd
import numpy as np
import polars as pl
from collections import Counter, defaultdict
import re
from scipy.stats import skew, kurtosis
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
import random

seed = 2024
np.random.seed(seed)
random.seed(seed)

import warnings
warnings.filterwarnings('ignore')

def get_Essays(df):
    # Reconstruct the final essay text of one writer from the keystroke log.
    USER_ID = df["id"].iloc[0]
    textInputDf = df[['activity', 'cursor_position', 'text_change']]
    currTextInput = textInputDf[textInputDf.activity != 'Nonproduction']
    essayText = ""
    for Input in currTextInput.values:
        # Input[0] = activity, Input[1] = cursor_position, Input[2] = text_change
        if Input[0] == 'Replace':
            # text_change has the form "old => new": splice the new text over the old one
            replaceTxt = Input[2].split(' => ')
            essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + \
                essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
            continue
        if Input[0] == 'Paste':
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
            continue
        if Input[0] == 'Remove/Cut':
            essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
            continue
        if "M" in Input[0]:
            # "Move From [a, b] To [c, d]": relocate the span [a, b) to [c, d)
            croppedTxt = Input[0][10:]
            splitTxt = croppedTxt.split(' To ')
            valueArr = [item.split(', ') for item in splitTxt]
            moveData = (int(valueArr[0][0][1:]),
                        int(valueArr[0][1][:-1]),
                        int(valueArr[1][0][1:]),
                        int(valueArr[1][1][:-1]))
            if moveData[0] != moveData[2]:
                if moveData[0] < moveData[2]:
                    essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + \
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                else:
                    essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + \
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
            continue
        # Plain 'Input': insert the typed text at the cursor position
        essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
    return USER_ID, essayText
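
# Minimal sanity check of the reconstruction above on a hand-made log (synthetic
# data, not the competition files): three 'Input' events type "qqq", then a
# 'Remove/Cut' at cursor position 2 deletes one character, leaving "qq".
_demo_log = pd.DataFrame({
    "id": ["demo", "demo", "demo", "demo"],
    "activity": ["Input", "Input", "Input", "Remove/Cut"],
    "cursor_position": [1, 2, 3, 2],
    "text_change": ["q", "q", "q", "q"],
})
assert get_Essays(_demo_log) == ("demo", "qq")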

AGGREGATIONS = ['count', 'min', 'max', 'first', 'last', 'median', 'sum', 'std']

def word_feats(df):
    # Word-level length statistics aggregated per essay id.
    essay_df = df
    essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!|\\,', x))
    essay_df = essay_df.explode('word')
    essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
    word_df = essay_df[essay_df['word_len'] != 0]
    word_agg_df = word_df[['id', 'word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df

def sent_feats(df):
    # Sentence-level length and word-count statistics aggregated per essay id.
    essay_df = df
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!', x))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n', '').strip())
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    df = essay_df[essay_df.sent_len != 0].reset_index(drop=True)
    sent_agg_df = pd.concat(
        [df[['id', 'sent_len']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count": "sent_count"})
    return sent_agg_df

def parag_feats(df):
    # Paragraph-level length, word-count and sentence-count statistics per essay id.
    essay_df = df
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    essay_df = essay_df.explode('paragraph')
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x))
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df['paragraph_sent_count'] = essay_df['paragraph'].apply(lambda x: len(re.split('\\.|\\?|\\!', x)))
    df = essay_df[essay_df.paragraph_len > 2].reset_index(drop=True)
    paragraph_agg_df = pd.concat(
        [df[['id', 'paragraph_len']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'paragraph_sent_count']].groupby(['id']).agg(AGGREGATIONS)
         ], axis=1
    )
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count", "paragraph_sent_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count": "paragraph_count"})
    return paragraph_agg_df

def ARI(txt):
    # Automated Readability Index: 4.71 * (chars per word) + 0.5 * (words per sentence) - 21.43.
    characters = len(txt)
    words = len(re.split(' |\\n|\\.|\\?|\\!|\\,', txt))
    sentence = len(re.split('\\.|\\?|\\!', txt))
    ari_score = 4.71 * (characters / words) + 0.5 * (words / sentence) - 21.43
    return ari_score
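
# Quick illustration of the ARI formula on an assumed sample sentence (not taken
# from the dataset). Note the crude regex splits also count empty tokens as
# "words"/"sentences"; that behavior is kept as in the original feature.
_ari_demo = ARI("The cat sat on the mat. It purred.")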
"""
http://www.supermagnus.com/mac/Word_Counter/index.html
McAlpine EFLAW© Test
(W + SW) / S
McAlpine EFLAW© Readability
Scale:
1-20: Easy
21-25: Quite Easy
26-29: Mildly Difficult
≥ 30: Very Confusing
S:total sentences
W:total words
"""
def McAlpine_EFLAW ( txt) :
W= len ( re. split( ' |\\n|\\.|\\?|\\!|\,' , txt) )
S= len ( re. split( '\\.|\\?|\\!' , txt) )
mcalpine_eflaw_score= ( W+ S* W) / S
return mcalpine_eflaw_score
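
# Small worked example for the EFLAW score above (same assumed sample sentence);
# on the scale quoted in the note, lower values indicate easier text.
_eflaw_demo = McAlpine_EFLAW("The cat sat on the mat. It purred.")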
"""
https://readable.com/readability/coleman-liau-readability-index/
=0.0588*L-0.296*S-15.8
L是每100个单词有多少个字母,S是平均每100个单词有多少句子.
"""
def CLRI ( txt) :
characters= len ( txt)
words= len ( re. split( ' |\\n|\\.|\\?|\\!|\,' , txt) )
sentence= len ( re. split( '\\.|\\?|\\!' , txt) )
L= 100 * characters/ words
S= 100 * sentence/ words
clri_score= 0.0588 * L- 0.296 * S- 15.8
return clri_score
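
# Companion example for the Coleman-Liau estimate (same assumed sample sentence).
_clri_demo = CLRI("The cat sat on the mat. It purred.")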

def get_text_chunk_features(df):
    # Surface-level features of the reconstructed essay text.
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    df['text_length'] = df['essay'].apply(len)
    df['num_newlines'] = df['essay'].apply(lambda x: x.count('\n'))
    df['automated_readability_index'] = df['essay'].apply(ARI)
    df['mcalpine_eflaw'] = df['essay'].apply(McAlpine_EFLAW)
    df['coleman_liau'] = df['essay'].apply(CLRI)
    # Typed characters are anonymized as 'q' in this dataset, so this ratio mostly
    # reflects the share of word characters in the essay.
    df['repetitiveness'] = df['essay'].apply(lambda x: x.count('q') / max(len(x), 1))
    df['avg_word_length'] = df['essay'].apply(lambda x: sum(len(word) for word in x.split()) / max(1, len(x.split())))
    df['word_lexical_diversity'] = df['essay'].apply(lambda x: len(set(x.split())) / len(x.split()))
    df['num_s_quotations'] = df['essay'].apply(lambda x: x.count("'"))
    df['num_d_quotations'] = df['essay'].apply(lambda x: x.count('"'))
    df['qm_count'] = df['essay'].apply(lambda x: x.count('?'))
    df['excm_count'] = df['essay'].apply(lambda x: x.count('!'))
    df['comma_count'] = df['essay'].apply(lambda x: x.count(','))
    df['dot_count'] = df['essay'].apply(lambda x: x.count('.'))
    df['num_prelist_count'] = df['essay'].apply(lambda x: x.count(':')) + \
        df['essay'].apply(lambda x: x.count(';'))
    # Simple punctuation-spacing mistakes, e.g. " ." / " ," and ",x" / ".x".
    df["space_n_dot_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\s\.', x)))
    df["space_n_comma_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\s\,', x)))
    df["comma_n_nonspace_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\,\S', x)))
    df["dot_n_nonspace_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\.\S', x)))
    df["total_punc_mistake"] = (
        df["space_n_dot_mistake"] +
        df["space_n_comma_mistake"] +
        df["comma_n_nonspace_mistake"] +
        df["dot_n_nonspace_mistake"]
    )
    df["punc_mistake_ratio"] = df["total_punc_mistake"] / (df['qm_count'] +
                                                           df['excm_count'] +
                                                           df['comma_count'] +
                                                           df['dot_count'])
    df['unique_word_count'] = df['essay'].apply(lambda x: len(set(re.findall(r'\w+', x.lower()))))
    df['punctuation_count'] = df['essay'].apply(lambda x: sum(x.count(p) for p in punctuation))
    return df

def standardize_text(txt):
    # Normalize whitespace: drop tabs, collapse blank lines and repeated spaces.
    txt = re.sub(r'\t', '', txt)
    txt = re.sub(r'\n {1,}', '\n', txt)
    txt = re.sub(r' {1,}\n', '\n', txt)
    txt = re.sub(r'\n{2,}', '\n', txt)
    txt = re.sub(r' {2,}', ' ', txt)
    txt = txt.strip()
    return txt
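
# Tiny check of the whitespace normalization (synthetic string): tabs are dropped,
# blank lines collapse to a single newline, and repeated spaces to one space.
assert standardize_text("q  q\t\n\n q") == "q q\nq"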

def TextProcessor(inp_df):
    # Replace fully blank essays with a single 'q' so the splits below never
    # produce empty token lists.
    for rowi in range(len(inp_df)):
        if inp_df.loc[rowi, "essay"].replace(" ", "") == "":
            inp_df.loc[rowi, "essay"] = "q"
    inp_df["essay"] = inp_df["essay"].apply(lambda x: standardize_text(txt=x))
    print("creating complete features")
    inp_df = get_text_chunk_features(inp_df)
    wf_df = word_feats(inp_df)
    sf_df = sent_feats(inp_df)
    pf_df = parag_feats(inp_df)
    inp_df = inp_df.merge(wf_df, how="left", on="id")
    inp_df = inp_df.merge(sf_df, how="left", on="id")
    inp_df = inp_df.merge(pf_df, how="left", on="id")
    inp_df.drop(["essay", "word", "sent", "paragraph"], axis=1, inplace=True)
    return inp_df

num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count', 'event_id']
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']

def count_by_values(df, colname, values):
    # Per-id occurrence count for each listed value of one categorical column.
    fts = df.select(pl.col('id').unique(maintain_order=True))
    for i, value in enumerate(values):
        tmp_df = df.group_by('id').agg(pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt'))
        fts = fts.join(tmp_df, on='id', how='left')
    return fts

def pause_stat_aggregator(df, prefix="iw"):
    # Summary statistics of the inter-keystroke pause ('time_diff', in seconds).
    temp = df.group_by("id").agg(
        pl.max('time_diff').alias(f"{prefix}_max_pause_time"),
        pl.median('time_diff').alias(f"{prefix}_median_pause_time"),
        pl.mean('time_diff').alias(f"{prefix}_mean_pause_time"),
        pl.min('time_diff').alias(f"{prefix}_min_pause_time"),
        pl.std('time_diff').alias(f"{prefix}_std_pause_time"),
        pl.sum('time_diff').alias(f"{prefix}_total_pause_time"),
        pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') <= 1)).count().alias(f"{prefix}_pauses_half_sec"),
        pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') <= 2)).count().alias(f"{prefix}_pauses_1_sec"),
        pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') <= 3)).count().alias(f"{prefix}_pauses_2_sec"),
        pl.col('time_diff').filter(pl.col('time_diff') > 3).count().alias(f"{prefix}_pauses_3_sec")
    )
    return temp

def dev_feats(df):
    print("< Count by values features >")
    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left')
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left')
    print("< Numerical columns features >")
    temp = df.group_by("id").agg(pl.sum('action_time').suffix('_sum'), pl.std(num_cols).suffix('_std'),
                                 pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'),
                                 pl.max(num_cols).suffix('_max'),
                                 )
    feats = feats.join(temp, on='id', how='left')
    print("< Categorical columns features >")
    temp = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left')
    print("< Creating pause features >")
    # Pause = gap between the previous key-up and the next key-down, in seconds.
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.with_columns((pl.col("up_event") == "Space").alias("is_space"))
    temp = temp.with_columns((pl.col("up_event") == ".").alias("is_dot"))
    temp = temp.with_columns((pl.col("up_event") == "Enter").alias("is_enter"))
    # Running word / sentence / paragraph ids, used to isolate the first pause of each unit.
    temp = temp.with_columns(
        pl.col("is_space").cumsum().shift().backward_fill().over("id").alias("word_id"),
        pl.col("is_dot").cumsum().shift().backward_fill().over("id").alias("sentence_id"),
        pl.col("is_enter").cumsum().shift().backward_fill().over("id").alias("paragraph_id"),
    )
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    iw_df = pause_stat_aggregator(df=temp, prefix="iw")      # pauses over all production keystrokes
    bww_df = temp.group_by("id", "word_id").agg(pl.col("time_diff").first())
    bww_df = pause_stat_aggregator(df=bww_df, prefix="bww")  # first pause of each word group
    bws_df = temp.group_by("id", "sentence_id").agg(pl.col("time_diff").first())
    bws_df = pause_stat_aggregator(df=bws_df, prefix="bws")  # first pause of each sentence group
    bwp_df = temp.group_by("id", "paragraph_id").agg(pl.col("time_diff").first())
    bwp_df = pause_stat_aggregator(df=bwp_df, prefix="bwp")  # first pause of each paragraph group
    feats = (feats.join(iw_df, on="id", how="left")
             .join(bww_df, on="id", how="left")
             .join(bws_df, on="id", how="left")
             .join(bwp_df, on="id", how="left")
             )
    feats = feats.to_pandas()
    return feats

def get_keys_pressed_per_second(logs):
    # Production keystrokes (Input / Remove/Cut) divided by total writing time in seconds.
    temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
    temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
    temp_df = temp_df.merge(temp_df_2, on='id', how='left')
    temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
    return temp_df[['id', 'keys_per_second']]

def burst_features(df, burst_type="p"):
    # P-bursts: runs of consecutive 'Input' events; R-bursts: runs of 'Remove/Cut' events.
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    if burst_type == "p":
        temp = temp.with_columns(pl.col('activity').is_in(['Input']))
    elif burst_type == "r":
        temp = temp.with_columns(pl.col('activity').is_in(['Remove/Cut']))
    temp = temp.with_columns((pl.col('action_time') / 1000).alias("action_time_s"))
    temp = temp.with_columns((pl.col('up_time') / 1000).alias("up_time_s"))
    # Label each consecutive run with rle_id(); non-burst rows become null and are dropped.
    temp = temp.with_columns(pl.when(pl.col("activity")).then(pl.col("activity").rle_id()).alias(f'{burst_type}_burst_group'))
    temp = temp.drop_nulls()
    temp = temp.group_by("id", f"{burst_type}_burst_group").agg(
        pl.count('activity').alias(f'{burst_type}_burst_group_keypress_count'),
        pl.sum('action_time_s').alias(f'{burst_type}_burst_group_timespent'),
        pl.mean('action_time_s').alias(f'{burst_type}_burst_keypress_timespent_mean'),
        pl.std('action_time_s').alias(f'{burst_type}_burst_keypress_timespent_std'),
        pl.min('up_time_s').alias(f'{burst_type}_burst_keypress_timestamp_first'),
        pl.max('up_time_s').alias(f'{burst_type}_burst_keypress_timestamp_last')
    )
    temp = temp.group_by("id").agg(
        pl.sum(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_sum'),
        pl.mean(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_mean'),
        pl.std(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_std'),
        pl.max(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_max'),
        pl.sum(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_sum'),
        pl.mean(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_mean'),
        pl.std(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_std'),
        pl.max(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_max'),
        pl.mean(f'{burst_type}_burst_keypress_timespent_mean').alias(f'{burst_type}_burst_keypress_timespent_mean'),
        pl.mean(f'{burst_type}_burst_keypress_timespent_std').alias(f'{burst_type}_burst_keypress_timespent_std'),
        pl.min(f'{burst_type}_burst_keypress_timestamp_first').alias(f'{burst_type}_burst_keypress_timestamp_first'),
        pl.max(f'{burst_type}_burst_keypress_timestamp_last').alias(f'{burst_type}_burst_keypress_timestamp_last')
    )
    temp = temp.to_pandas()
    return temp

def Preprocessor(logs):
    pl_logs = pl.from_pandas(logs)
    print("< Creating keys_pressed_per_second features >")
    feat_df = get_keys_pressed_per_second(logs)
    feat_df = feat_df.merge(dev_feats(df=pl_logs), how="left", on="id")
    print("< Creating PR-Burst features >")
    feat_df = feat_df.merge(burst_features(df=pl_logs, burst_type="p"), how="left", on="id")
    feat_df = feat_df.merge(burst_features(df=pl_logs, burst_type="r"), how="left", on="id")
    # Reconstruct each essay from its log and add the text-based features.
    essays = logs.groupby("id").apply(get_Essays)
    essays = pd.DataFrame(essays.tolist(), columns=["id", "essay"])
    essay_feats = TextProcessor(essays)
    feat_df = feat_df.merge(essay_feats, how="left", on="id")
    # Ratio features normalized by total writing time (up_time_max is in milliseconds).
    feat_df["p_bursts_time_ratio"] = feat_df["p_burst_timespent_sum"] / (feat_df["up_time_max"] / 1000)
    feat_df["r_bursts_time_ratio"] = feat_df["r_burst_timespent_sum"] / (feat_df["up_time_max"] / 1000)
    feat_df["action_time_ratio"] = feat_df["action_time_sum"] / feat_df["up_time_max"]
    feat_df["pause_time_ratio"] = feat_df["iw_total_pause_time"] / (feat_df["up_time_max"] / 1000)
    feat_df["pausecount_time_ratio"] = feat_df["iw_pauses_2_sec"] / (feat_df["up_time_max"] / 1000)
    feat_df['word_time_ratio'] = feat_df['word_count_max'] / (feat_df["up_time_max"] / 1000)
    feat_df['word_event_ratio'] = feat_df['word_count_max'] / feat_df["event_id_max"]
    feat_df['event_time_ratio'] = feat_df['event_id_max'] / (feat_df["up_time_max"] / 1000)
    feat_df["text_length_time_ratio"] = feat_df["text_length"] / (feat_df["up_time_max"] / 1000)
    return feat_df

train_logs = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
print(f"len(train_logs): {len(train_logs)}")
train_logs = train_logs.sort_values(by=['id', 'down_time'])
train_logs = train_logs.reset_index(drop=True)
train_logs['event_id'] = train_logs.groupby('id').cumcount() + 1
train_scores = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")

test_logs = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
print(f"len(test_logs): {len(test_logs)}")
test_logs = test_logs.sort_values(by=['id', 'down_time'])
test_logs = test_logs.reset_index(drop=True)
test_logs['event_id'] = test_logs.groupby('id').cumcount() + 1

print("feature engineer")
train_feats = Preprocessor(train_logs)
train_feats = train_feats.merge(train_scores, how="left", on="id")
test_feats = Preprocessor(test_logs)

# Drop columns that are constant in train, then clean infinities from the ratio features.
keys = train_feats.keys().values
unique_cols = [key for key in keys if train_feats[key].nunique() < 2]
print(f"drop unique_cols: {unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)
test_feats = test_feats.drop(columns=unique_cols)
train_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
test_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
train_feats.drop(['id'], axis=1, inplace=True)
print(f"total_feats_counts: {len(test_feats.keys().values)}")

def make_model():
    cat_params = {'learning_rate': 0.024906985231770738, 'depth': 5,
                  'l2_leaf_reg': 3.7139894959529283, 'subsample': 0.18527466886647015,
                  'colsample_bylevel': 0.6552973951000719, 'min_data_in_leaf': 93,
                  "silent": True, "iterations": 1000, "random_state": seed, "use_best_model": False
                  }
    lgb_params = {'reg_alpha': 1.0894488472899402, 'reg_lambda': 6.290929934336985,
                  'colsample_bytree': 0.6218522907548012, 'subsample': 0.9579924238280629,
                  'learning_rate': 0.0027076430412427566, 'max_depth': 8, 'num_leaves': 947,
                  'min_child_samples': 57, 'n_estimators': 2500, 'metric': 'rmse',
                  'random_state': seed, 'verbosity': -1, 'force_col_wise': True
                  }
    xgb_params = {'max_depth': 2, 'learning_rate': 0.009998236038809146,
                  'n_estimators': 1000, 'min_child_weight': 17,
                  'gamma': 0.1288249858838246, 'subsample': 0.5078057280148618,
                  'colsample_bytree': 0.7355762136239921, 'reg_alpha': 0.670956206987811,
                  'reg_lambda': 0.06818351284100388, 'random_state': seed
                  }
    model1 = LGBMRegressor(**lgb_params)
    model2 = CatBoostRegressor(**cat_params)
    model3 = XGBRegressor(**xgb_params)
    models = []
    models.append((model1, 'lgb'))
    models.append((model2, 'cat'))
    models.append((model3, 'xgb'))
    return models

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

X = train_feats.drop(['score'], axis=1)
y = train_feats['score']
models_and_errors_dict = {}
y_hats = dict()
submission_df = pd.DataFrame(test_feats['id'])
submission_df['score'] = 3.5
X_unseen = test_feats.drop(['id'], axis=1).copy()
num_folds = 10
for model, model_type in make_model():
    oof_pred = np.zeros((len(y)))
    y_hats[model_type] = []
    # Stratify on the score (cast to str) so every fold sees the full score range.
    skf = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y.astype(str))):
        X_train, X_test = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_test = y.iloc[train_index], y.iloc[valid_index]
        X_train_copy, X_test_copy = X_train.copy(), X_test.copy()
        model.fit(X_train_copy, y_train)
        y_hat = model.predict(X_test_copy)
        oof_pred[valid_index] = y_hat
        rmse = RMSE(y_test, y_hat)
        print(f'RMSE: {rmse} on fold {fold}')
        X_unseen_copy = X_unseen.copy()
        y_hats[model_type].append(model.predict(X_unseen_copy))
        if model_type not in models_and_errors_dict:
            models_and_errors_dict[model_type] = []
        models_and_errors_dict[model_type].append((model, rmse, None, None, oof_pred))
# Average the per-fold test predictions of each model type.
for key in y_hats.keys():
    if y_hats[key]:
        y_hat_avg = np.mean(y_hats[key], axis=0)
        submission_df['score_' + key] = y_hat_avg
submission_df.head()

blending_weights = {
    'lgb': 0.4,
    'cat': 0.4,
    'xgb': 0.2,
}
# Out-of-fold predictions; the last fold entry holds the fully filled OOF array.
lgb_oof_pred = models_and_errors_dict['lgb'][num_folds - 1][4]
cat_oof_pred = models_and_errors_dict['cat'][num_folds - 1][4]
xgb_oof_pred = models_and_errors_dict['xgb'][num_folds - 1][4]
# Brute-force search over the weight simplex with step 1/margin,
# keeping the combination with the lowest OOF RMSE.
margin = 1000
target = y.values
current_RMSE = RMSE(target, (lgb_oof_pred + cat_oof_pred + xgb_oof_pred) / 3)
best_i = 0
best_j = 0
for i in range(0, margin):
    for j in range(0, margin - i):
        blend_oof_pred = (i * lgb_oof_pred + j * cat_oof_pred + (margin - i - j) * xgb_oof_pred) / margin
        if RMSE(target, blend_oof_pred) < current_RMSE:
            current_RMSE = RMSE(target, blend_oof_pred)
            best_i = i
            best_j = j
blending_weights['lgb'] = best_i / margin
blending_weights['cat'] = best_j / margin
blending_weights['xgb'] = (margin - best_i - best_j) / margin
print(f"current_RMSE: {current_RMSE}, blending_weights: {blending_weights}")
print ( "blending" )
blended_score= np. zeros( ( len ( test_feats) ) )
for k, v in blending_weights. items( ) :
blended_score += submission_df[ 'score_' + k] * v
print ( f"blended_score: { blended_score} " )
submission= pd. read_csv( "/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv" )
submission[ 'score' ] = blended_score
submission. to_csv( "submission.csv" , index= None )
submission. head( )