import numpy as np


def encode(featureset, label, mapping):
    """Return the sparse joint-feature encoding of ``(featureset, label)``.

    Args:
        featureset: Dict mapping feature names to feature values.
        label: Candidate label paired with every feature of ``featureset``.
        mapping: Dict keyed by ``(fname, fval, label)`` triples; the stored
            value is what gets emitted for a matching triple (the callers
            in this file use it as an array index).

    Returns:
        A list of ``(mapping[(fname, fval, label)], 1)`` pairs, one for each
        feature whose triple is present in ``mapping``, in ``featureset``
        iteration order. Triples missing from ``mapping`` are skipped, so
        the result may be empty.
    """
    return [
        (mapping[(fname, fval, label)], 1)
        for fname, fval in featureset.items()
        if (fname, fval, label) in mapping
    ]
# Compute sum over (x, y) of f_i(x, y): the empirical feature counts.
def calculate_empirical_fcount(train_toks, mapping):
    """Accumulate the observed value of every joint feature over the data.

    Args:
        train_toks: Iterable of ``(featureset, label)`` pairs.
        mapping: Dict keyed by ``(fname, fval, label)`` triples whose values
            are used as indices into the returned array.

    Returns:
        A float NumPy array of length ``len(mapping)``; entry ``i`` is the
        sum of the values ``encode`` emits for index ``i`` across all
        training pairs.
    """
    fcount = np.zeros(len(mapping))
    for tok, label in train_toks:
        for index, val in encode(tok, label, mapping):
            fcount[index] += val
    return fcount
# Compute P(y | x): the conditional label distribution for one token.
def prob(tok, labels, mapping, weights):
    """Return the normalised probability of each label for ``tok``.

    Each label is scored as the weighted sum of its active joint features
    (as produced by ``encode``); the scores are then exponentiated and
    normalised so that the returned values sum to 1.

    Args:
        tok: Featureset (dict of feature name -> feature value).
        labels: Iterable of candidate labels.
        mapping: Dict keyed by ``(fname, fval, label)`` triples whose values
            index into ``weights``.
        weights: Indexable collection of feature weights.

    Returns:
        Dict mapping every label to its probability. Empty when ``labels``
        is empty.
    """
    scores = {}
    for label in labels:
        total = 0.0
        for index, val in encode(tok, label, mapping):
            total += weights[index] * val
        scores[label] = total

    if not scores:
        return {}

    # Shift by the largest score before exponentiating. The shift cancels in
    # the normalisation, so the distribution is unchanged, but np.exp can no
    # longer overflow to inf and turn the result into nan.
    top = max(scores.values())
    exp_scores = {label: np.exp(score - top) for label, score in scores.items()}
    value_sum = sum(exp_scores.values())
    return {label: value / value_sum for label, value in exp_scores.items()}
# Compute sum over (x, y) of P(y | x) * f_i(x, y): the expected feature
# counts under the current model.
def calculate_estimated_fcount(train_toks, mapping, labels, weights):
    """Accumulate the model-expected value of every joint feature.

    For each training token, every candidate label contributes its feature
    values weighted by the probability ``prob`` assigns to that label.

    Args:
        train_toks: Iterable of ``(featureset, label)`` pairs. Only the
            featureset is used; the gold label is ignored here.
        mapping: Dict keyed by ``(fname, fval, label)`` triples whose values
            are used as indices into the returned array.
        labels: Iterable of candidate labels.
        weights: Indexable collection of feature weights.

    Returns:
        A float NumPy array of length ``len(mapping)`` holding the expected
        feature counts.
    """
    fcount = np.zeros(len(mapping))
    # The gold label is unpacked but unused; naming it `_` avoids shadowing
    # it with the candidate label of the inner loop.
    for tok, _ in train_toks:
        for label, p in prob(tok, labels, mapping, weights).items():
            for index, val in encode(tok, label, mapping):
                fcount[index] += p * val
    return fcount