# mingancijishu

'''
    -*- coding: utf-8 -*-
    统计生产数据中的话术中各敏感词出现的频率。
'''


import pandas as pd
from functools import cmp_to_key
from datetime import datetime
from tqdm import tqdm


# Wall-clock reference taken before any I/O; the elapsed time is printed at
# the very end of the script.
starttime = datetime.now()

# Locations of the two input workbooks and of the result workbook.
infile1 = "./input/input1/" + "生产数据.xlsx"   # production conversation records
infile2 = "./input/input1/" + "敏感词.xlsx"     # sensitive-word table
outfile = "./output/output1/" + "统计结果.xlsx"  # statistics are written here

# Load both workbooks. The sensitive-word sheet is re-indexed by its
# 'user_say' column so that its rows can later be fetched by label.
df1 = pd.read_excel(infile1)
df2 = pd.read_excel(infile2)
df2 = df2.set_index('user_say')
print("\n数据读取完成")

def compare(a, b):
    """Three-way comparison of two sized objects by their length.

    Suitable as a comparison function for ``functools.cmp_to_key``.

    Args:
        a: First object; must support ``len()``.
        b: Second object; must support ``len()``.

    Returns:
        ``1`` if ``a`` is longer than ``b``, ``-1`` if it is shorter,
        and ``0`` when both have the same length.
    """
    len_a, len_b = len(a), len(b)
    if len_a == len_b:
        return 0
    return 1 if len_a > len_b else -1

def _count_sensitive_words(texts, words):
    """Count, for each word, how many texts contain it (longest words first).

    Words are processed from longest to shortest. Once a word has been
    counted in a text, every occurrence of it is removed from that text, so
    a shorter word that only appears as part of a longer one is not counted
    again for the same text.

    Args:
        texts: Iterable of conversation contents. Entries that are not
            strings (e.g. NaN coming from empty cells) are skipped.
        words: Iterable of sensitive words (strings).

    Returns:
        A list of ``(word, count)`` tuples ordered from the longest word to
        the shortest, where ``count`` is the number of texts that contained
        the word.
    """
    # Work on a private list: the removals must persist between words, which
    # a chained ``data.iloc[i][col] = ...`` assignment does not guarantee.
    remaining = list(texts)
    results = []
    # ``sorted`` is stable, so words of equal length keep their input order.
    for word in sorted(words, key=len, reverse=True):
        count = 0
        for idx, text in enumerate(remaining):
            if isinstance(text, str) and word in text:
                count += 1
                remaining[idx] = text.replace(word, '')
        results.append((word, count))
    return results


grouped = df1.groupby('匹配扩展问')
rows = []
for user_say2, data in tqdm(grouped):
    # Row of sensitive words configured for this question; empty cells are
    # dropped. ``.loc`` raises KeyError if the question is not in the table.
    sensitive_words = df2.loc[user_say2].dropna().tolist()
    # Output row layout: question, word1, count1, word2, count2, ...
    new_row = [user_say2]
    word_counts = _count_sensitive_words(data['会话内容'].tolist(), sensitive_words)
    for word, count in word_counts:
        new_row.append(word)
        new_row.append(str(count))
    rows.append(new_row)
print("\n敏感词统计完成")

# Build the frame once from all rows (DataFrame.append was removed in
# pandas 2.0). Shorter rows are padded with missing values, and the index is
# already 0..n-1, so no reset_index is needed.
df_out = pd.DataFrame(rows)
df_out.to_excel(outfile)
print("\n用时:", datetime.now() - starttime)
# 全部评论
#
# 相关推荐
#
# 点赞 收藏 评论
# 分享
# 牛客网
# 牛客企业服务