- 任务: 用逻辑回归做二分类的意图判定。输入是一串仅由大写字母A~G组成的字符串;输出是标签0或1。
- 特征: 对每条字符串做7维的字母存在性编码(multi-hot),维度顺序固定为A B C D E F G;某字母在串中出现过(不论出现次数)则该维取1,否则取0。
- 模型: 单层逻辑回归,权重w和偏置b初始为0;激活用sigmoid;损失为二分类交叉熵;优化用学习率0.1、轮数20、batch size=1 的梯度下降;最终预测阈值0.5,大于阈值判1,否则判0。
样例输入(首行为训练样本数N和测试样本数M;随后N行每行为"字符串 标签";再随后M行每行为一个测试字符串):
5 2
ABC 1
ADG 1
BE 1
CFG 1
ABCFG 1
A
BG
样例输出:
1
1
样例解释: 训练集中所有样本的标注均为1;从w=b=0出发,每次更新的梯度(预测值−标签)恒为负,因此b以及样本中出现过的字母所对应的权重只增不减,z=w·x+b在首次更新后始终大于0,预测概率超过0.5,因而对任意测试串都输出1。
import math
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) without overflowing.

    Args:
        x: Real-valued input (the pre-activation score).

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if x >= 0:
        # exp(-x) lies in (0, 1] here, so it can never overflow.
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) would overflow once x drops below about -709;
    # the algebraically equivalent form below only evaluates exp(x) < 1.
    e = math.exp(x)
    return e / (1 + e)
class BinGraph:
    """Single-layer logistic-regression binary classifier.

    The model keeps one weight per input feature plus a bias, all starting
    at zero, and is trained by updating the parameters after every single
    sample (batch size 1).
    """

    def __init__(self, dim=7):
        """Create a zero-initialised model.

        Args:
            dim: Number of input features; defaults to 7.
        """
        self.w = [0.0] * dim
        self.b = 0.0

    def train(self, N, trn, lbl, total_epoch=20, learning_rate=0.1):
        """Fit the weights and bias in place with per-sample gradient descent.

        Args:
            N: Number of leading samples of ``trn``/``lbl`` to train on.
            trn: Indexable collection of feature vectors; each vector must
                provide at least as many entries as the model has weights.
            lbl: Indexable collection of target labels aligned with ``trn``.
            total_epoch: Number of full passes over the first ``N`` samples.
            learning_rate: Step size applied to every parameter update.
        """
        for _ in range(total_epoch):
            for ii in range(N):
                x = trn[ii]
                # For a sigmoid output trained with binary cross-entropy,
                # the gradient w.r.t. the pre-activation is (pred - label).
                grad = self.forward(x) - lbl[ii]
                self.b -= grad * learning_rate
                for jj in range(len(self.w)):
                    self.w[jj] -= grad * x[jj] * learning_rate

    def forward(self, x):
        """Return the predicted probability of class 1 for one sample.

        Args:
            x: Feature vector with at least as many entries as the model
                has weights; extra entries are ignored.

        Returns:
            The sigmoid of ``b + sum(w[j] * x[j])`` as a float.
        """
        # Accumulate term by term, starting from the bias, so the
        # floating-point summation order stays fixed.
        z = self.b
        for jj, w_j in enumerate(self.w):
            z += w_j * x[jj]
        return sigmoid(z)
def str2num(s, alphabet="ABCDEFG"):
    """Encode a string as a presence vector over ``alphabet``.

    Args:
        s: String whose characters must all belong to ``alphabet``.
        alphabet: Ordered symbols that define the vector positions;
            defaults to the seven letters A-G.

    Returns:
        A tuple of ints, one per symbol of ``alphabet``: 1 if that symbol
        occurs in ``s`` at least once, otherwise 0.

    Raises:
        ValueError: If ``s`` contains a character that is not in
            ``alphabet``.
    """
    present = set(s)
    unknown = present.difference(alphabet)
    if unknown:
        # Reject unknown symbols explicitly: offset-based indexing would let
        # characters just below 'A' wrap around as negative indices and
        # silently set the wrong feature.
        raise ValueError(
            f"unexpected character(s) {sorted(unknown)!r} in {s!r}"
        )
    return tuple(1 if ch in present else 0 for ch in alphabet)
def read_in():
    """Read the training portion of the input from standard input.

    The first line holds two integers: the number of training samples and
    the number of test samples. Each following training line holds a
    string and its label separated by whitespace.

    Returns:
        A 4-tuple ``(N, M, trn, lbl)`` where ``trn`` is a tuple of encoded
        feature vectors and ``lbl`` is a tuple of float labels.
    """
    N, M = (int(token) for token in input().split())
    pairs = []
    for _ in range(N):
        tokens = input().split()
        # Encode the string first, then parse the label, one line at a time.
        pairs.append((str2num(tokens[0]), float(tokens[1])))
    trn = tuple(features for features, _ in pairs)
    lbl = tuple(label for _, label in pairs)
    return N, M, trn, lbl
if __name__ == '__main__':
    # Unpack into descriptive names rather than rebinding `read_in` to its
    # own result, which would shadow the function and force magic indices.
    n_train, n_test, train_x, train_y = read_in()
    model = BinGraph()
    model.train(n_train, train_x, train_y)
    for _ in range(n_test):
        # Strip so a trailing '\r' or space never reaches the encoder.
        sample = str2num(input().strip())
        # Decision threshold is 0.5: strictly greater predicts class 1.
        print(1 if model.forward(sample) > 0.5 else 0)