# WikiCommentEdit/eval.py
import math
from statistics import mean

import sklearn.metrics
import torch

from process_data import _make_char_vector, _make_word_vector, gen_cmntrank_batches, gen_editanch_batches
from process_data import to_var, make_vector
from wiki_util import tokenizeText

## single-sample prediction
def predict(pred_cmnt, pred_ctx, w2i, c2i, model, max_ctx_length):
    """Run the model on a single (comment, context) pair and return the predicted label."""
    print("prediction on single sample ...")
    model.eval()
    _, pred_cmnt_words = tokenizeText(pred_cmnt)
    pred_cmnt_chars = [list(w) for w in pred_cmnt_words]
    _, pred_ctx_words = tokenizeText(pred_ctx)
    pred_ctx_chars = [list(w) for w in pred_ctx_words]

    cmnt_sent_len = len(pred_cmnt_words)
    cmnt_word_len = int(mean([len(w) for w in pred_cmnt_chars]))
    ctx_sent_len = max_ctx_length
    ctx_word_len = int(mean([len(w) for w in pred_ctx_chars]))

    # build a single-sample batch of word- and char-level index tensors
    cmnt_words, cmnt_chars, ctx_words, ctx_chars = [], [], [], []
    cmnt_words.append(_make_word_vector(pred_cmnt_words, w2i, cmnt_sent_len))
    cmnt_chars.append(_make_char_vector(pred_cmnt_chars, c2i, cmnt_sent_len, cmnt_word_len))
    ctx_words.append(_make_word_vector(pred_ctx_words, w2i, ctx_sent_len))
    ctx_chars.append(_make_char_vector(pred_ctx_chars, c2i, ctx_sent_len, ctx_word_len))

    cmnt_words = to_var(torch.LongTensor(cmnt_words))
    cmnt_chars = to_var(torch.stack(cmnt_chars, 0))
    ctx_words = to_var(torch.LongTensor(ctx_words))
    ctx_chars = to_var(torch.stack(ctx_chars, 0))

    logit, _ = model(ctx_words, ctx_chars, cmnt_words, cmnt_chars)
    # argmax over the class dimension; keep the winning logit as a rough confidence
    y_prob, y_pred = torch.max(logit.cpu(), -1)
    y_pred, y_prob = int(y_pred.item()), float(y_prob.item())
    print(y_pred, y_prob)
    return y_pred
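
# A minimal usage sketch (the strings and length are hypothetical; w2i/c2i and
# the trained model come from the training pipeline, and max_ctx_length should
# match the value used at training time):
#
#   label = predict("fix typo in the intro", "The articl discusses ...",
#                   w2i, c2i, model, max_ctx_length=300)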

def compute_rank_score(pos_score, neg_scores):
    """Score one query: the positive candidate ranked against its negatives."""
    p1, p3, p5 = 0, 0, 0
    # pos = number of negatives ranked at or above the positive (ties count against it)
    pos_list = [0 if pos_score > neg_score else 1 for neg_score in neg_scores]
    pos = sum(pos_list)
    # precision @K: 1 if the positive lands in the top K
    if pos == 0: p1 = 1
    if pos < 3: p3 = 1
    if pos < 5: p5 = 1
    # MRR: the positive sits at rank pos + 1
    mrr = 1 / (pos + 1)
    # NDCG = DCG / IDCG (in our case, rel=1 if relevant, otherwise rel=0, so IDCG=1)
    ndcg = 1 / math.log2(pos + 2)
    return p1, p3, p5, mrr, ndcg
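
# Worked example (a sanity check added for illustration, not part of the original
# pipeline): with pos_score=0.9 and neg_scores=[0.95, 0.5, 0.3], exactly one
# negative outranks the positive, so the positive sits at rank 2:
# P@1=0, P@3=1, P@5=1, MRR=1/2, NDCG=1/log2(3).
def _check_rank_score_example():
    p1, p3, p5, mrr, ndcg = compute_rank_score(0.9, [0.95, 0.5, 0.3])
    assert (p1, p3, p5) == (0, 1, 1)
    assert abs(mrr - 0.5) < 1e-9
    assert abs(ndcg - 1 / math.log2(3)) < 1e-9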

def get_rank(pos_score, neg_scores):
    """Return the 1-based rank of the positive among all candidates."""
    pos_list = [0 if pos_score > neg_score else 1 for neg_score in neg_scores]
    pos = sum(pos_list)
    return pos + 1
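
# e.g. get_rank(0.9, [0.95, 0.5, 0.3]) == 2: one negative (0.95) outscores the
# positive; ties also count against the positive since the comparison is strict.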

def isEditPredCorrect(pred, truth):
    """True only if every predicted anchoring label matches the ground truth."""
    for i in range(len(pred)):
        if pred[i] != truth[i]:
            return False
    return True
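
# For equal-length sequences this is equivalent to list(pred) == list(truth).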

def eval_rank(score_pos, score_neg, cand_num):
    """Aggregate P@K, MRR, and NDCG over a batch of ranking queries."""
    score_pos_list = score_pos.data.cpu().squeeze(1).numpy().tolist()
    score_neg_list = score_neg.data.cpu().squeeze(1).numpy().tolist()
    correct_p1, correct_p3, correct_p5, total_mrr, total_ndcg = 0, 0, 0, 0, 0
    neg_num = cand_num - 1
    batch_num = int(len(score_neg) / neg_num)
    rank_list = []
    for i in range(batch_num):
        score_pos_i = score_pos_list[i * neg_num: (i + 1) * neg_num]
        score_neg_i = score_neg_list[i * neg_num: (i + 1) * neg_num]
        p1, p3, p5, mrr, ndcg = compute_rank_score(score_pos_i[0], score_neg_i)
        rank = get_rank(score_pos_i[0], score_neg_i)
        rank_list.append(rank)
        correct_p1 += p1
        correct_p3 += p3
        correct_p5 += p5
        total_mrr += mrr
        total_ndcg += ndcg
    return correct_p1, correct_p3, correct_p5, total_mrr, total_ndcg, rank_list
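
# Layout assumption behind the slicing above (inferred from the indexing, not
# stated in the original): score_pos and score_neg are (batch_num * neg_num, 1)
# tensors in which each query's positive score is tiled neg_num times, so
# score_pos_list[i * neg_num] is query i's positive score and the matching
# slice of score_neg_list holds its neg_num negative scores.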
## general evaluation
def eval(dataset, val_df, w2i, model, args):
    model.eval()
    cmnt_rank_p1, cmnt_rank_p3, cmnt_rank_p5, cmnt_rank_mrr, cmnt_rank_ndcg = 0, 0, 0, 0, 0
    ea_pred, ea_truth = [], []
    cr_total, ea_total = 0, 0
    pred_cr_list = []
    for batch in dataset.iterate_minibatches(val_df, args.batch_size):
        cmnt_sent_len = args.max_cmnt_length
        ctx_sent_len = args.max_ctx_length
        diff_sent_len = args.max_diff_length

        ###########################################################
        # Comment Ranking Task
        ###########################################################
        # generate positive and negative batches
        pos_batch, neg_batch = gen_cmntrank_batches(batch, w2i, cmnt_sent_len, diff_sent_len, ctx_sent_len,
                                                    args.rank_num)
        pos_cmnt, pos_src_token, pos_src_action, pos_tgt_token, pos_tgt_action = \
            make_vector(pos_batch, w2i, cmnt_sent_len, ctx_sent_len)
        neg_cmnt, neg_src_token, neg_src_action, neg_tgt_token, neg_tgt_action = \
            make_vector(neg_batch, w2i, cmnt_sent_len, ctx_sent_len)
        score_pos, _ = model(pos_cmnt, pos_src_token, pos_src_action, pos_tgt_token, pos_tgt_action, cr_mode=True)
        score_neg, _ = model(neg_cmnt, neg_src_token, neg_src_action, neg_tgt_token, neg_tgt_action, cr_mode=True)
        cr_p1_corr, cr_p3_corr, cr_p5_corr, cr_mrr, cr_ndcg, pred_rank = \
            eval_rank(score_pos, score_neg, args.rank_num)
        cmnt_rank_p1 += cr_p1_corr
        cmnt_rank_p3 += cr_p3_corr
        cmnt_rank_p5 += cr_p5_corr
        cmnt_rank_mrr += cr_mrr
        cmnt_rank_ndcg += cr_ndcg
        # each query contributes rank_num - 1 negatives, so this recovers the query count
        cr_total += int(len(score_pos) / (args.rank_num - 1))
        pred_cr_list += pred_rank
        ###########################################################
        # Edits Anchoring
        ###########################################################
        # generate candidate batches together with their anchoring labels
        ea_batch, ea_truth_cur = gen_editanch_batches(batch, w2i, cmnt_sent_len, diff_sent_len, ctx_sent_len,
                                                      args.anchor_num)
        if len(ea_batch[0]) > 0:  # skip batches that yield no anchoring candidates
            cmnt, src_token, src_action, tgt_token, tgt_action = \
                make_vector(ea_batch, w2i, cmnt_sent_len, ctx_sent_len)
            logit, _ = model(cmnt, src_token, src_action, tgt_token, tgt_action, cr_mode=False)
            ea_pred_cur = torch.max(logit, 1)[1].view(logit.size(0)).data.tolist()
            ea_pred += ea_pred_cur
            ea_truth += ea_truth_cur
            ea_total += int(len(score_pos) / (args.anchor_num - 1))
    cr_p1_acc = cmnt_rank_p1 / cr_total
    cr_p3_acc = cmnt_rank_p3 / cr_total
    cr_p5_acc = cmnt_rank_p5 / cr_total
    cr_mrr = cmnt_rank_mrr / cr_total
    cr_ndcg = cmnt_rank_ndcg / cr_total
    ea_acc = sklearn.metrics.accuracy_score(ea_truth, ea_pred)
    ea_f1 = sklearn.metrics.f1_score(ea_truth, ea_pred, pos_label=1)
    ea_prec = sklearn.metrics.precision_score(ea_truth, ea_pred, pos_label=1)
    ea_recall = sklearn.metrics.recall_score(ea_truth, ea_pred, pos_label=1)
    print("\n*** Validation Results *** ")
# print("[Task-CR] P@1:", "%.3f" % cr_p1_acc, "% P@3:", "%.3f" % cr_p3_acc, "% P@5:", "%.3f" % cr_p5_acc,\
# '%', ' (', cmnt_rank_p1, '/', cr_total, ',', cmnt_rank_p3, '/', cr_total, ',', cmnt_rank_p5, '/', cr_total,')', sep='')
# print("[Task-RA] P@1:", "%.3f" % ea_p1_acc, "% P@3:", "%.3f" % ea_p3_acc, "% P@5:", "%.3f" % ea_p5_acc,\
# '%', ' (', edit_anch_p1, '/', ea_total, ',', edit_anch_p3, '/', ea_total, ',', edit_anch_p5, '/', ea_total,')', sep='')
print("[Task-CR] P@1:", "%.3f" % cr_p1_acc, " P@3:", "%.3f" % cr_p3_acc, " P@5:", "%.3f" % cr_p5_acc, " MRR:",
"%.3f" % cr_mrr, " NDCG:", "%.3f" % cr_ndcg, sep='')
print("[Task-EA] ACC:", "%.3f" % ea_acc, " F1:", "%.3f" % ea_f1, " Precision:", "%.3f" % ea_prec, " Recall:",
"%.3f" % ea_recall, sep='')
return cr_p1_acc, ea_f1
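
# A minimal sketch of how eval() is typically driven from a training loop
# (train_one_epoch, train_df, and args.epochs are assumptions for illustration;
# eval itself only reads batch_size, rank_num, anchor_num, and the
# max_*_length fields from args):
#
#   best_p1 = 0.0
#   for epoch in range(args.epochs):
#       train_one_epoch(model, dataset, train_df, args)  # hypothetical trainer
#       cr_p1, ea_f1 = eval(dataset, val_df, w2i, model, args)
#       if cr_p1 > best_p1:
#           best_p1 = cr_p1
#           torch.save(model.state_dict(), args.checkpoint_path + 'best.pt')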

def dump_cmntrank_case(pos_batch, neg_batch, idx, rank_num, diff_url, rank, pos_score, neg_scores):
    """Print a ranking case where no positive-comment token overlaps the edit but a negative one does."""
    neg_num = rank_num - 1
    pos_cmnt = pos_batch[0][idx * neg_num]
    neg_cmnts = neg_batch[0][idx * neg_num: (idx + 1) * neg_num]
    before_edit = pos_batch[1][idx * neg_num]
    after_edit = pos_batch[3][idx * neg_num]
    # does any token of the positive comment appear in the edited text?
    match = False
    for token in pos_cmnt:
        if token in before_edit + after_edit:
            match = True
            break
    # collect tokens (longer than 3 chars) of negative comments that appear in the edit
    neg_match_words = []
    neg_match = False
    for neg_cmnt in neg_cmnts:
        for token in neg_cmnt:
            if len(token) <= 3:
                continue
            if token in before_edit + after_edit:
                neg_match = True
                neg_match_words.append(token)
    if not match and neg_match:
        print("\n ====== cmntrank case (Not Matched) ======")
        print("Rank", rank)
        print(diff_url)
        print("pos_cmnt (", "{0:.3f}".format(pos_score), "): ", " ".join(pos_cmnt), sep='')
        for i, neg_cmnt in enumerate(neg_cmnts):
            print("neg_cmnt ", i, " (", "{0:.3f}".format(neg_scores[i]), "): ", " ".join(neg_cmnt), sep='')
        print("neg_match_words:", " ".join(neg_match_words))

def dump_editanch_case(comment, edit, pred, truth):
    """Print one edit-anchoring example with its predicted and true labels."""
    print("\n ====== editanch case ======")
    print("pred/truth: ", pred, "/", truth)
    print("comment:", " ".join(comment))
    print("edit:", " ".join(edit))

def case_study(dataset, val_df, w2i, model, args):
    model.eval()
    print("Start the case study")
    for batch in dataset.iterate_minibatches(val_df, args.batch_size):
        cmnt_sent_len = args.max_cmnt_length
        ctx_sent_len = args.max_ctx_length
        diff_sent_len = args.max_diff_length

        ###########################################################
        # Comment Ranking Task
        ###########################################################
        # generate positive and negative batches
        if args.cr_train:
            pos_batch, neg_batch = gen_cmntrank_batches(batch, w2i, cmnt_sent_len, diff_sent_len, ctx_sent_len,
                                                        args.rank_num)
            pos_cmnt, pos_src_token, pos_src_action, pos_tgt_token, pos_tgt_action = \
                make_vector(pos_batch, w2i, cmnt_sent_len, ctx_sent_len)
            neg_cmnt, neg_src_token, neg_src_action, neg_tgt_token, neg_tgt_action = \
                make_vector(neg_batch, w2i, cmnt_sent_len, ctx_sent_len)
            score_pos, _ = model(pos_cmnt, pos_src_token, pos_src_action, pos_tgt_token, pos_tgt_action,
                                 cr_mode=True)
            score_neg, _ = model(neg_cmnt, neg_src_token, neg_src_action, neg_tgt_token, neg_tgt_action,
                                 cr_mode=True)
            score_pos_list = score_pos.data.cpu().squeeze(1).numpy().tolist()
            score_neg_list = score_neg.data.cpu().squeeze(1).numpy().tolist()
            neg_num = args.rank_num - 1
            batch_num = int(len(score_neg) / neg_num)
            for i in range(batch_num):
                score_pos_i = score_pos_list[i * neg_num: (i + 1) * neg_num]
                score_neg_i = score_neg_list[i * neg_num: (i + 1) * neg_num]
                rank = get_rank(score_pos_i[0], score_neg_i)
                dump_cmntrank_case(pos_batch, neg_batch, i, args.rank_num, batch[8][i], rank, score_pos_i[0],
                                   score_neg_i)

        ###########################################################
        # Edits Anchoring
        ###########################################################
        # generate candidate batches together with their anchoring labels
        if args.ea_train:
            ea_batch, ea_truth_cur = gen_editanch_batches(batch, w2i, cmnt_sent_len, diff_sent_len, ctx_sent_len,
                                                          args.anchor_num)
            cmnt, src_token, src_action, tgt_token, tgt_action = \
                make_vector(ea_batch, w2i, cmnt_sent_len, ctx_sent_len)
            logit, _ = model(cmnt, src_token, src_action, tgt_token, tgt_action, cr_mode=False)
            ea_pred_cur = torch.max(logit, 1)[1].view(logit.size(0)).data.tolist()
            for i in range(len(ea_truth_cur)):
                # optionally filter here, e.g. only dump cases where ea_pred_cur[i] == ea_truth_cur[i]
                dump_editanch_case(ea_batch[0][i], ea_batch[3][i], ea_pred_cur[i], ea_truth_cur[i])