上篇: 用go构建个简单的搜索(七) BM25打分
流程概述
graph TD
A(加载文档) --> b[切词]
b[切词] --> c[统计文档和词频关系]
c --> d[BM25关键词打分]
d --> e[打分排序]
f[关键词查询]
当前处理点和遇到的问题
- demo是用map处理的,map好处是可以用到k-v数据库作为持久化
- Go 的 map 如何按 value 排序:map[int]float32(文档ID => 得分)
- Go 中 map 的 value 是结构体时如何修改其字段:m[k] 不可寻址,不能直接写 m[k].field = x,需要先取出副本、修改后再写回(或者让 map 存结构体指针)
困惑的地方
- 文档是不停增长的,但是 BM25 打分依赖文档总数和包含该关键词(至少出现一次)的文档数,文档增加后已有的得分都会随之变化,该如何处理比较好?
- 如何定义一个父类的结构体,实现自定义的子结构体数据处理
测试效果

代码demo
// docWord is the per-word index entry built by TestBM26: for a single word it
// records which documents contain it and the score of each of those documents.
type docWord struct {
// word_s is the word itself; TestBM26 also uses it as the map key of the entry.
word_s string
// word_tf maps a document index to the counter TestBM26 increments for this
// word in that document. Its length is used as the number of documents
// containing the word.
word_tf map[int]int
// word_score maps a document index to the BM25 score of this word for that
// document.
word_score map[int]float32
// score_sort holds the same scores as (document, score) pairs, sorted by
// TestBM26 in descending score order.
score_sort []scoreSort
}
// scoreSort pairs a document with its score so that scores kept in a map can
// be sorted as a slice.
type scoreSort struct {
// id is the document index (position of the document in the loaded slice).
id int
// score is the BM25 score of the word for that document.
score float32
}
// TestBM26 builds an in-memory inverted index over the lines of a text file,
// scores every (word, document) pair with BM25, and prints the documents that
// contain the query word "福贵" from the highest to the lowest score.
//
// Every element returned by loadFileByLine is treated as one document and its
// slice index is used as the document ID. When loadFileByLine reports failure
// the test is skipped with a message instead of silently passing.
func TestBM26(t *testing.T) {
	const corpusPath = "D:/开发语言/文章/小说/活着2.txt"

	str, isTrue := loadFileByLine(corpusPath)
	if !isTrue {
		t.Skipf("corpus %q could not be loaded; skipping BM25 demo", corpusPath)
	}

	avgLen := getAVG(str)
	docCount := float32(len(str))

	// Phase 1: build word -> docWord, recording the documents each word occurs in.
	docWordArry := make(map[string]docWord)
	for i, s := range str {
		wordTF := getSplicWordTF(s)
		// NOTE(review): only the keys of wordTF are used, so (assuming it is a
		// map keyed by word) every document ends up with a counter of exactly 1
		// for each of its words. If the values of wordTF are per-document
		// occurrence counts, they are discarded here — confirm whether BM25
		// should receive that count instead.
		for word := range wordTF {
			entry, ok := docWordArry[word]
			if !ok {
				entry = docWord{
					word_s:     word,
					word_tf:    make(map[int]int),
					word_score: make(map[int]float32),
				}
				docWordArry[word] = entry
			}
			// word_tf is a map, so the increment is visible through the copy
			// stored in docWordArry without writing the struct back.
			entry.word_tf[i]++
		}
	}

	// Phase 2: score each (word, document) pair and sort the documents of
	// every word by descending score.
	for w, dw := range docWordArry {
		// Number of documents that contain the word.
		docFreq := float32(len(dw.word_tf))
		dw.score_sort = make([]scoreSort, 0, len(dw.word_tf))
		for index, tf := range dw.word_tf {
			// Document length is measured in runes, not bytes.
			docLen := float32(len([]rune(str[index])))
			score := BM25(float32(tf), docLen, avgLen, docCount, docFreq)
			dw.word_score[index] = score
			dw.score_sort = append(dw.score_sort, scoreSort{id: index, score: score})
		}
		// sort.Slice is not stable and the slice was filled in map iteration
		// order, so break ties on the document ID to keep output deterministic.
		sort.Slice(dw.score_sort, func(a, b int) bool {
			x, y := dw.score_sort[a], dw.score_sort[b]
			if x.score != y.score {
				return x.score > y.score
			}
			return x.id < y.id
		})
		// dw is a copy of the map value; store it back so score_sort is kept.
		docWordArry[w] = dw
	}

	// Phase 3: query. A word missing from the index yields the zero docWord,
	// whose score_sort is nil, so nothing is printed.
	search := docWordArry["福贵"]
	for _, hit := range search.score_sort {
		fmt.Printf("%s\n文档ID:%d,词:%s,打分:%f\n", str[hit.id], hit.id, search.word_s, hit.score)
	}
}