import pandas as pd
import numpy as np
import polars as pl
from collections import Counter, defaultdict
import re
from scipy.stats import skew, kurtosis
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
import random

seed = 2024
np.random.seed(seed)
random.seed(seed)

import warnings
warnings.filterwarnings('ignore')

def get_Essays(df):
    # Reconstruct the final essay text of one writer from the keystroke log.
    USER_ID = df["id"].iloc[0]
    textInputDf = df[['activity', 'cursor_position', 'text_change']]
    currTextInput = textInputDf[textInputDf.activity != 'Nonproduction']
    essayText = ""
    for Input in currTextInput.values:
        # Input[0] = activity, Input[1] = cursor_position, Input[2] = text_change
        if Input[0] == 'Replace':
            # text_change has the form "old => new": splice the new text over the old one
            replaceTxt = Input[2].split(' => ')
            essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + \
                essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
            continue
        if Input[0] == 'Paste':
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
            continue
        if Input[0] == 'Remove/Cut':
            essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
            continue
        if "M" in Input[0]:
            # "Move From [a, b] To [c, d]": relocate the span [a, b) to [c, d)
            croppedTxt = Input[0][10:]
            splitTxt = croppedTxt.split(' To ')
            valueArr = [item.split(', ') for item in splitTxt]
            moveData = (int(valueArr[0][0][1:]),
                        int(valueArr[0][1][:-1]),
                        int(valueArr[1][0][1:]),
                        int(valueArr[1][1][:-1]))
            if moveData[0] != moveData[2]:
                if moveData[0] < moveData[2]:
                    essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + \
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                else:
                    essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + \
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
            continue
        # Plain 'Input': insert the typed text at the cursor position
        essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
    return USER_ID, essayText
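
# Minimal sanity check of the reconstruction above on a hand-made log (synthetic
# data, not the competition files): three 'Input' events type "qqq", then a
# 'Remove/Cut' at cursor position 2 deletes one character, leaving "qq".
_demo_log = pd.DataFrame({
    "id": ["demo", "demo", "demo", "demo"],
    "activity": ["Input", "Input", "Input", "Remove/Cut"],
    "cursor_position": [1, 2, 3, 2],
    "text_change": ["q", "q", "q", "q"],
})
assert get_Essays(_demo_log) == ("demo", "qq")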

AGGREGATIONS = ['count', 'min', 'max', 'first', 'last', 'median', 'sum', 'std']

def word_feats(df):
    # Word-level length statistics aggregated per essay id.
    essay_df = df
    essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!|\\,', x))
    essay_df = essay_df.explode('word')
    essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
    word_df = essay_df[essay_df['word_len'] != 0]
    word_agg_df = word_df[['id', 'word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df

def sent_feats(df):
    # Sentence-level length and word-count statistics aggregated per essay id.
    essay_df = df
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!', x))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n', '').strip())
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    df = essay_df[essay_df.sent_len != 0].reset_index(drop=True)
    sent_agg_df = pd.concat(
        [df[['id', 'sent_len']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count": "sent_count"})
    return sent_agg_df

def parag_feats(df):
    # Paragraph-level length, word-count and sentence-count statistics per essay id.
    essay_df = df
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    essay_df = essay_df.explode('paragraph')
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x))
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df['paragraph_sent_count'] = essay_df['paragraph'].apply(lambda x: len(re.split('\\.|\\?|\\!', x)))
    df = essay_df[essay_df.paragraph_len > 2].reset_index(drop=True)
    paragraph_agg_df = pd.concat(
        [df[['id', 'paragraph_len']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'paragraph_sent_count']].groupby(['id']).agg(AGGREGATIONS)
         ], axis=1
    )
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count", "paragraph_sent_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count": "paragraph_count"})
    return paragraph_agg_df

def ARI(txt):
    # Automated Readability Index: 4.71 * (chars per word) + 0.5 * (words per sentence) - 21.43.
    characters = len(txt)
    words = len(re.split(' |\\n|\\.|\\?|\\!|\\,', txt))
    sentence = len(re.split('\\.|\\?|\\!', txt))
    ari_score = 4.71 * (characters / words) + 0.5 * (words / sentence) - 21.43
    return ari_score
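
# Quick illustration of the ARI formula on an assumed sample sentence (not taken
# from the dataset). Note the crude regex splits also count empty tokens as
# "words"/"sentences"; that behavior is kept as in the original feature.
_ari_demo = ARI("The cat sat on the mat. It purred.")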
"""
http://www.supermagnus.com/mac/Word_Counter/index.html
McAlpine EFLAW© Test
(W + SW) / S
McAlpine EFLAW© Readability
Scale:
1-20: Easy
21-25: Quite Easy
26-29: Mildly Difficult
≥ 30: Very Confusing
S:total sentences
W:total words
"""
def McAlpine_EFLAW ( txt) :
W= len ( re. split( ' |\\n|\\.|\\?|\\!|\,' , txt) )
S= len ( re. split( '\\.|\\?|\\!' , txt) )
mcalpine_eflaw_score= ( W+ S* W) / S
return mcalpine_eflaw_score
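
# Small worked example for the EFLAW score above (same assumed sample sentence);
# on the scale quoted in the note, lower values indicate easier text.
_eflaw_demo = McAlpine_EFLAW("The cat sat on the mat. It purred.")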
"""
https://readable.com/readability/coleman-liau-readability-index/
=0.0588*L-0.296*S-15.8
L是每100个单词有多少个字母,S是平均每100个单词有多少句子.
"""
def CLRI ( txt) :
characters= len ( txt)
words= len ( re. split( ' |\\n|\\.|\\?|\\!|\,' , txt) )
sentence= len ( re. split( '\\.|\\?|\\!' , txt) )
L= 100 * characters/ words
S= 100 * sentence/ words
clri_score= 0.0588 * L- 0.296 * S- 15.8
return clri_score
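
# Companion example for the Coleman-Liau estimate (same assumed sample sentence).
_clri_demo = CLRI("The cat sat on the mat. It purred.")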

def get_text_chunk_features(df):
    # Surface-level features of the reconstructed essay text.
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    df['text_length'] = df['essay'].apply(len)
    df['num_newlines'] = df['essay'].apply(lambda x: x.count('\n'))
    df['automated_readability_index'] = df['essay'].apply(ARI)
    df['mcalpine_eflaw'] = df['essay'].apply(McAlpine_EFLAW)
    df['coleman_liau'] = df['essay'].apply(CLRI)
    # Typed characters are anonymized as 'q' in this dataset, so this ratio mostly
    # reflects the share of word characters in the essay.
    df['repetitiveness'] = df['essay'].apply(lambda x: x.count('q') / max(len(x), 1))
    df['avg_word_length'] = df['essay'].apply(lambda x: sum(len(word) for word in x.split()) / max(1, len(x.split())))
    df['word_lexical_diversity'] = df['essay'].apply(lambda x: len(set(x.split())) / len(x.split()))
    df['num_s_quotations'] = df['essay'].apply(lambda x: x.count("'"))
    df['num_d_quotations'] = df['essay'].apply(lambda x: x.count('"'))
    df['qm_count'] = df['essay'].apply(lambda x: x.count('?'))
    df['excm_count'] = df['essay'].apply(lambda x: x.count('!'))
    df['comma_count'] = df['essay'].apply(lambda x: x.count(','))
    df['dot_count'] = df['essay'].apply(lambda x: x.count('.'))
    df['num_prelist_count'] = df['essay'].apply(lambda x: x.count(':')) + \
        df['essay'].apply(lambda x: x.count(';'))
    # Simple punctuation-spacing mistakes, e.g. " ." / " ," and ",x" / ".x".
    df["space_n_dot_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\s\.', x)))
    df["space_n_comma_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\s\,', x)))
    df["comma_n_nonspace_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\,\S', x)))
    df["dot_n_nonspace_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\.\S', x)))
    df["total_punc_mistake"] = (
        df["space_n_dot_mistake"] +
        df["space_n_comma_mistake"] +
        df["comma_n_nonspace_mistake"] +
        df["dot_n_nonspace_mistake"]
    )
    df["punc_mistake_ratio"] = df["total_punc_mistake"] / (df['qm_count'] +
                                                           df['excm_count'] +
                                                           df['comma_count'] +
                                                           df['dot_count'])
    df['unique_word_count'] = df['essay'].apply(lambda x: len(set(re.findall(r'\w+', x.lower()))))
    df['punctuation_count'] = df['essay'].apply(lambda x: sum(x.count(p) for p in punctuation))
    return df

def standardize_text(txt):
    # Normalize whitespace: drop tabs, collapse blank lines and repeated spaces.
    txt = re.sub(r'\t', '', txt)
    txt = re.sub(r'\n {1,}', '\n', txt)
    txt = re.sub(r' {1,}\n', '\n', txt)
    txt = re.sub(r'\n{2,}', '\n', txt)
    txt = re.sub(r' {2,}', ' ', txt)
    txt = txt.strip()
    return txt
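
# Tiny check of the whitespace normalization (synthetic string): tabs are dropped,
# blank lines collapse to a single newline, and repeated spaces to one space.
assert standardize_text("q  q\t\n\n q") == "q q\nq"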

def TextProcessor(inp_df):
    # Replace fully blank essays with a single 'q' so the splits below never
    # produce empty token lists.
    for rowi in range(len(inp_df)):
        if inp_df.loc[rowi, "essay"].replace(" ", "") == "":
            inp_df.loc[rowi, "essay"] = "q"
    inp_df["essay"] = inp_df["essay"].apply(lambda x: standardize_text(txt=x))
    print("creating complete features")
    inp_df = get_text_chunk_features(inp_df)
    wf_df = word_feats(inp_df)
    sf_df = sent_feats(inp_df)
    pf_df = parag_feats(inp_df)
    inp_df = inp_df.merge(wf_df, how="left", on="id")
    inp_df = inp_df.merge(sf_df, how="left", on="id")
    inp_df = inp_df.merge(pf_df, how="left", on="id")
    inp_df.drop(["essay", "word", "sent", "paragraph"], axis=1, inplace=True)
    return inp_df

num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count', 'event_id']
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']

def count_by_values(df, colname, values):
    # Per-id occurrence count for each listed value of one categorical column.
    fts = df.select(pl.col('id').unique(maintain_order=True))
    for i, value in enumerate(values):
        tmp_df = df.group_by('id').agg(pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt'))
        fts = fts.join(tmp_df, on='id', how='left')
    return fts

def pause_stat_aggregator(df, prefix="iw"):
    # Summary statistics of the inter-keystroke pause ('time_diff', in seconds).
    temp = df.group_by("id").agg(
        pl.max('time_diff').alias(f"{prefix}_max_pause_time"),
        pl.median('time_diff').alias(f"{prefix}_median_pause_time"),
        pl.mean('time_diff').alias(f"{prefix}_mean_pause_time"),
        pl.min('time_diff').alias(f"{prefix}_min_pause_time"),
        pl.std('time_diff').alias(f"{prefix}_std_pause_time"),
        pl.sum('time_diff').alias(f"{prefix}_total_pause_time"),
        pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') <= 1)).count().alias(f"{prefix}_pauses_half_sec"),
        pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') <= 2)).count().alias(f"{prefix}_pauses_1_sec"),
        pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') <= 3)).count().alias(f"{prefix}_pauses_2_sec"),
        pl.col('time_diff').filter(pl.col('time_diff') > 3).count().alias(f"{prefix}_pauses_3_sec")
    )
    return temp

def dev_feats(df):
    print("< Count by values features >")
    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left')
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left')
    print("< Numerical columns features >")
    temp = df.group_by("id").agg(pl.sum('action_time').suffix('_sum'), pl.std(num_cols).suffix('_std'),
                                 pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'),
                                 pl.max(num_cols).suffix('_max'),
                                 )
    feats = feats.join(temp, on='id', how='left')
    print("< Categorical columns features >")
    temp = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left')
    print("< Creating pause features >")
    # Pause = gap between the previous key-up and the next key-down, in seconds.
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.with_columns((pl.col("up_event") == "Space").alias("is_space"))
    temp = temp.with_columns((pl.col("up_event") == ".").alias("is_dot"))
    temp = temp.with_columns((pl.col("up_event") == "Enter").alias("is_enter"))
    # Running word / sentence / paragraph ids, used to isolate the first pause of each unit.
    temp = temp.with_columns(
        pl.col("is_space").cumsum().shift().backward_fill().over("id").alias("word_id"),
        pl.col("is_dot").cumsum().shift().backward_fill().over("id").alias("sentence_id"),
        pl.col("is_enter").cumsum().shift().backward_fill().over("id").alias("paragraph_id"),
    )
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    iw_df = pause_stat_aggregator(df=temp, prefix="iw")      # pauses over all production keystrokes
    bww_df = temp.group_by("id", "word_id").agg(pl.col("time_diff").first())
    bww_df = pause_stat_aggregator(df=bww_df, prefix="bww")  # first pause of each word group
    bws_df = temp.group_by("id", "sentence_id").agg(pl.col("time_diff").first())
    bws_df = pause_stat_aggregator(df=bws_df, prefix="bws")  # first pause of each sentence group
    bwp_df = temp.group_by("id", "paragraph_id").agg(pl.col("time_diff").first())
    bwp_df = pause_stat_aggregator(df=bwp_df, prefix="bwp")  # first pause of each paragraph group
    feats = (feats.join(iw_df, on="id", how="left")
             .join(bww_df, on="id", how="left")
             .join(bws_df, on="id", how="left")
             .join(bwp_df, on="id", how="left")
             )
    feats = feats.to_pandas()
    return feats

def get_keys_pressed_per_second(logs):
    # Production keystrokes (Input / Remove/Cut) divided by total writing time in seconds.
    temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
    temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
    temp_df = temp_df.merge(temp_df_2, on='id', how='left')
    temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
    return temp_df[['id', 'keys_per_second']]

def burst_features(df, burst_type="p"):
    # P-bursts: runs of consecutive 'Input' events; R-bursts: runs of 'Remove/Cut' events.
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    if burst_type == "p":
        temp = temp.with_columns(pl.col('activity').is_in(['Input']))
    elif burst_type == "r":
        temp = temp.with_columns(pl.col('activity').is_in(['Remove/Cut']))
    temp = temp.with_columns((pl.col('action_time') / 1000).alias("action_time_s"))
    temp = temp.with_columns((pl.col('up_time') / 1000).alias("up_time_s"))
    # Label each consecutive run with rle_id(); non-burst rows become null and are dropped.
    temp = temp.with_columns(pl.when(pl.col("activity")).then(pl.col("activity").rle_id()).alias(f'{burst_type}_burst_group'))
    temp = temp.drop_nulls()
    temp = temp.group_by("id", f"{burst_type}_burst_group").agg(
        pl.count('activity').alias(f'{burst_type}_burst_group_keypress_count'),
        pl.sum('action_time_s').alias(f'{burst_type}_burst_group_timespent'),
        pl.mean('action_time_s').alias(f'{burst_type}_burst_keypress_timespent_mean'),
        pl.std('action_time_s').alias(f'{burst_type}_burst_keypress_timespent_std'),
        pl.min('up_time_s').alias(f'{burst_type}_burst_keypress_timestamp_first'),
        pl.max('up_time_s').alias(f'{burst_type}_burst_keypress_timestamp_last')
    )
    temp = temp.group_by("id").agg(
        pl.sum(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_sum'),
        pl.mean(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_mean'),
        pl.std(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_std'),
        pl.max(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_max'),
        pl.sum(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_sum'),
        pl.mean(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_mean'),
        pl.std(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_std'),
        pl.max(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_max'),
        pl.mean(f'{burst_type}_burst_keypress_timespent_mean').alias(f'{burst_type}_burst_keypress_timespent_mean'),
        pl.mean(f'{burst_type}_burst_keypress_timespent_std').alias(f'{burst_type}_burst_keypress_timespent_std'),
        pl.min(f'{burst_type}_burst_keypress_timestamp_first').alias(f'{burst_type}_burst_keypress_timestamp_first'),
        pl.max(f'{burst_type}_burst_keypress_timestamp_last').alias(f'{burst_type}_burst_keypress_timestamp_last')
    )
    temp = temp.to_pandas()
    return temp

def Preprocessor(logs):
    pl_logs = pl.from_pandas(logs)
    print("< Creating keys_pressed_per_second features >")
    feat_df = get_keys_pressed_per_second(logs)
    feat_df = feat_df.merge(dev_feats(df=pl_logs), how="left", on="id")
    print("< Creating PR-Burst features >")
    feat_df = feat_df.merge(burst_features(df=pl_logs, burst_type="p"), how="left", on="id")
    feat_df = feat_df.merge(burst_features(df=pl_logs, burst_type="r"), how="left", on="id")
    # Reconstruct each essay from its log and add the text-based features.
    essays = logs.groupby("id").apply(get_Essays)
    essays = pd.DataFrame(essays.tolist(), columns=["id", "essay"])
    essay_feats = TextProcessor(essays)
    feat_df = feat_df.merge(essay_feats, how="left", on="id")
    # Ratio features normalized by total writing time (up_time_max is in milliseconds).
    feat_df["p_bursts_time_ratio"] = feat_df["p_burst_timespent_sum"] / (feat_df["up_time_max"] / 1000)
    feat_df["r_bursts_time_ratio"] = feat_df["r_burst_timespent_sum"] / (feat_df["up_time_max"] / 1000)
    feat_df["action_time_ratio"] = feat_df["action_time_sum"] / feat_df["up_time_max"]
    feat_df["pause_time_ratio"] = feat_df["iw_total_pause_time"] / (feat_df["up_time_max"] / 1000)
    feat_df["pausecount_time_ratio"] = feat_df["iw_pauses_2_sec"] / (feat_df["up_time_max"] / 1000)
    feat_df['word_time_ratio'] = feat_df['word_count_max'] / (feat_df["up_time_max"] / 1000)
    feat_df['word_event_ratio'] = feat_df['word_count_max'] / feat_df["event_id_max"]
    feat_df['event_time_ratio'] = feat_df['event_id_max'] / (feat_df["up_time_max"] / 1000)
    feat_df["text_length_time_ratio"] = feat_df["text_length"] / (feat_df["up_time_max"] / 1000)
    return feat_df

train_logs = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
print(f"len(train_logs): {len(train_logs)}")
train_logs = train_logs.sort_values(by=['id', 'down_time'])
train_logs = train_logs.reset_index(drop=True)
train_logs['event_id'] = train_logs.groupby('id').cumcount() + 1
train_scores = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")

test_logs = pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
print(f"len(test_logs): {len(test_logs)}")
test_logs = test_logs.sort_values(by=['id', 'down_time'])
test_logs = test_logs.reset_index(drop=True)
test_logs['event_id'] = test_logs.groupby('id').cumcount() + 1

print("feature engineer")
train_feats = Preprocessor(train_logs)
train_feats = train_feats.merge(train_scores, how="left", on="id")
test_feats = Preprocessor(test_logs)

# Drop columns that are constant in train, then clean infinities from the ratio features.
keys = train_feats.keys().values
unique_cols = [key for key in keys if train_feats[key].nunique() < 2]
print(f"drop unique_cols: {unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)
test_feats = test_feats.drop(columns=unique_cols)
train_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
test_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
train_feats.drop(['id'], axis=1, inplace=True)
print(f"total_feats_counts: {len(test_feats.keys().values)}")

def make_model():
    cat_params = {'learning_rate': 0.024906985231770738, 'depth': 5,
                  'l2_leaf_reg': 3.7139894959529283, 'subsample': 0.18527466886647015,
                  'colsample_bylevel': 0.6552973951000719, 'min_data_in_leaf': 93,
                  "silent": True, "iterations": 1000, "random_state": seed, "use_best_model": False
                  }
    lgb_params = {'reg_alpha': 1.0894488472899402, 'reg_lambda': 6.290929934336985,
                  'colsample_bytree': 0.6218522907548012, 'subsample': 0.9579924238280629,
                  'learning_rate': 0.0027076430412427566, 'max_depth': 8, 'num_leaves': 947,
                  'min_child_samples': 57, 'n_estimators': 2500, 'metric': 'rmse',
                  'random_state': seed, 'verbosity': -1, 'force_col_wise': True
                  }
    xgb_params = {'max_depth': 2, 'learning_rate': 0.009998236038809146,
                  'n_estimators': 1000, 'min_child_weight': 17,
                  'gamma': 0.1288249858838246, 'subsample': 0.5078057280148618,
                  'colsample_bytree': 0.7355762136239921, 'reg_alpha': 0.670956206987811,
                  'reg_lambda': 0.06818351284100388, 'random_state': seed
                  }
    model1 = LGBMRegressor(**lgb_params)
    model2 = CatBoostRegressor(**cat_params)
    model3 = XGBRegressor(**xgb_params)
    models = []
    models.append((model1, 'lgb'))
    models.append((model2, 'cat'))
    models.append((model3, 'xgb'))
    return models

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

X = train_feats.drop(['score'], axis=1)
y = train_feats['score']
models_and_errors_dict = {}
y_hats = dict()
submission_df = pd.DataFrame(test_feats['id'])
submission_df['score'] = 3.5
X_unseen = test_feats.drop(['id'], axis=1).copy()
num_folds = 10
for model, model_type in make_model():
    oof_pred = np.zeros((len(y)))
    y_hats[model_type] = []
    # Stratify on the score (cast to str) so every fold sees the full score range.
    skf = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y.astype(str))):
        X_train, X_test = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_test = y.iloc[train_index], y.iloc[valid_index]
        X_train_copy, X_test_copy = X_train.copy(), X_test.copy()
        model.fit(X_train_copy, y_train)
        y_hat = model.predict(X_test_copy)
        oof_pred[valid_index] = y_hat
        rmse = RMSE(y_test, y_hat)
        print(f'RMSE: {rmse} on fold {fold}')
        X_unseen_copy = X_unseen.copy()
        y_hats[model_type].append(model.predict(X_unseen_copy))
        if model_type not in models_and_errors_dict:
            models_and_errors_dict[model_type] = []
        models_and_errors_dict[model_type].append((model, rmse, None, None, oof_pred))
# Average the per-fold test predictions of each model type.
for key in y_hats.keys():
    if y_hats[key]:
        y_hat_avg = np.mean(y_hats[key], axis=0)
        submission_df['score_' + key] = y_hat_avg
submission_df.head()

blending_weights = {
    'lgb': 0.4,
    'cat': 0.4,
    'xgb': 0.2,
}
# Out-of-fold predictions; the last fold entry holds the fully filled OOF array.
lgb_oof_pred = models_and_errors_dict['lgb'][num_folds - 1][4]
cat_oof_pred = models_and_errors_dict['cat'][num_folds - 1][4]
xgb_oof_pred = models_and_errors_dict['xgb'][num_folds - 1][4]
# Brute-force search over the weight simplex with step 1/margin,
# keeping the combination with the lowest OOF RMSE.
margin = 1000
target = y.values
current_RMSE = RMSE(target, (lgb_oof_pred + cat_oof_pred + xgb_oof_pred) / 3)
best_i = 0
best_j = 0
for i in range(0, margin):
    for j in range(0, margin - i):
        blend_oof_pred = (i * lgb_oof_pred + j * cat_oof_pred + (margin - i - j) * xgb_oof_pred) / margin
        if RMSE(target, blend_oof_pred) < current_RMSE:
            current_RMSE = RMSE(target, blend_oof_pred)
            best_i = i
            best_j = j
blending_weights['lgb'] = best_i / margin
blending_weights['cat'] = best_j / margin
blending_weights['xgb'] = (margin - best_i - best_j) / margin
print(f"current_RMSE: {current_RMSE}, blending_weights: {blending_weights}")
print ( "blending" )
blended_score= np. zeros( ( len ( test_feats) ) )
for k, v in blending_weights. items( ) :
blended_score += submission_df[ 'score_' + k] * v
print ( f"blended_score: { blended_score} " )
submission= pd. read_csv( "/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv" )
submission[ 'score' ] = blended_score
submission. to_csv( "submission.csv" , index= None )
submission. head( )