new submissions
This commit is contained in:
Parent: d66a7639c3
Commit: bfdd802d20
@@ -0,0 +1 @@
__author__ = 'vrama91'
@@ -0,0 +1,105 @@
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
#
# Author : Ramakrishna Vedantam <vrama91@vt.edu>

import numpy as np


def my_lcs(string, sub):
    """
    Calculates longest common subsequence for a pair of tokenized strings

    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length : int : length of the longest common subsequence between the two strings

    Note: my_lcs only gives the length of the longest common subsequence, not the actual LCS
    """
    if len(string) < len(sub):
        sub, string = string, sub

    lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)]

    for j in range(1, len(sub) + 1):
        for i in range(1, len(string) + 1):
            if string[i - 1] == sub[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
            else:
                lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])

    return lengths[len(string)][len(sub)]


class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    '''
    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image

        :param candidate: list of str : single-element list holding the candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated against references)
        """
        assert len(candidate) == 1
        assert len(refs) > 0
        prec = []
        rec = []

        # split into tokens
        token_c = candidate[0].split(" ")

        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            score = ((1 + self.beta**2) * prec_max * rec_max) / float(rec_max + self.beta**2 * prec_max)
        else:
            score = 0.0
        return score

    def compute_score(self, gts, res):
        """
        Computes ROUGE-L score given a set of reference and candidate sentences for the dataset

        Invoked by evaluate_captions.py

        :param gts: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
        :param res: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
        :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
        """
        assert gts.keys() == res.keys()
        imgIds = gts.keys()

        score = []
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            score.append(self.calc_score(hypo, ref))

            # Sanity check.
            assert type(hypo) is list
            assert len(hypo) == 1
            assert type(ref) is list
            assert len(ref) > 0

        average_score = np.mean(np.array(score))
        return average_score, np.array(score)

    def method(self):
        return "Rouge"
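For context, a minimal usage sketch of the scorer above (the image id and sentences are invented for illustration; the dict shapes follow the compute_score docstring):

    from rouge import Rouge  # assumes the file above is saved as rouge.py

    scorer = Rouge()
    # keys are image ids; res holds exactly one tokenized candidate per image,
    # gts holds one or more tokenized references per image
    res = {'img1': ['a dog runs in a park']}
    gts = {'img1': ['a dog runs in the park', 'a dog is running outside']}
    average_score, scores = scorer.compute_score(gts, res)
    print(average_score)  # mean ROUGE-L across images; here a single image

Note the beta = 1.2 weighting: calc_score computes an F-measure that favors recall over precision, taking the best precision and best recall across the references for each image.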
@@ -0,0 +1,22 @@
QnA Metrics
{'testlen': 595574, 'reflen': 965609, 'guess': [595574, 539985, 484398, 428846], 'correct': [262038, 105860, 62640, 41489]}
ratio: 0.6167858833130173
############################
F1: 0.7095818893165093
bleu_1: 0.23637297986544653
bleu_2: 0.15778243503336944
bleu_3: 0.12003613413452242
bleu_4: 0.09737202953134436
rouge_l: 0.30710908155907457
############################
NLGEN Metrics
{'testlen': 179238, 'reflen': 185678, 'guess': [179238, 162761, 146284, 129817], 'correct': [81382, 44263, 28435, 18526]}
ratio: 0.9653163002617383
############################
F1: 1.0
bleu_1: 0.43802018103794066
bleu_2: 0.3389926123404134
bleu_3: 0.278277167790246
bleu_4: 0.23338324092925958
rouge_l: 0.37453516033239237
############################
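The bleu_n values in these dumps are reproducible from the printed stats dict under the usual corpus-BLEU reading of its fields: correct[k]/guess[k] are the k-gram precisions, ratio is testlen/reflen, and a brevity penalty exp(1 - reflen/testlen) applies when the candidates are shorter overall than the references. A minimal sketch of that check (the function name is ours, not from the commit):

    import math

    def bleu_from_stats(stats, n):
        # k-gram precisions p_k = correct[k] / guess[k]
        precisions = [c / float(g) for c, g in zip(stats['correct'][:n], stats['guess'][:n])]
        # brevity penalty when candidates are shorter than the references overall
        bp = 1.0 if stats['testlen'] >= stats['reflen'] else math.exp(1 - stats['reflen'] / float(stats['testlen']))
        # BLEU-n = BP * geometric mean of p_1 .. p_n
        return bp * math.exp(sum(math.log(p) for p in precisions) / n)

    qna = {'testlen': 595574, 'reflen': 965609,
           'guess': [595574, 539985, 484398, 428846],
           'correct': [262038, 105860, 62640, 41489]}
    print(bleu_from_stats(qna, 1))  # ~0.23637, matching bleu_1 above
    print(bleu_from_stats(qna, 4))  # ~0.09737, matching bleu_4 above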
@@ -0,0 +1,22 @@
QnA Metrics
{'testlen': 775580, 'reflen': 966391, 'guess': [775580, 719991, 664402, 608813], 'correct': [407563, 250275, 193605, 159369]}
ratio: 0.802553003908355
############################
F1: 0.7095818893165093
bleu_1: 0.4108860984489985
bleu_2: 0.3341816312622258
bleu_3: 0.29412686598173166
bleu_4: 0.26863916497393286
rouge_l: 0.4339947235728949
############################
NLGEN Metrics
{'testlen': 229910, 'reflen': 188685, 'guess': [229910, 213433, 196956, 180479], 'correct': [106372, 69349, 51811, 40179]}
ratio: 1.2184858361819901
############################
F1: 1.0
bleu_1: 0.4626680005219414
bleu_2: 0.3877252179823093
bleu_3: 0.3406959353239066
bleu_4: 0.30631504351834743
rouge_l: 0.43941356310975876
############################