diff --git a/Evaluation/rouge_metric/__init__.py b/Evaluation/rouge_metric/__init__.py
new file mode 100644
index 0000000..43a773e
--- /dev/null
+++ b/Evaluation/rouge_metric/__init__.py
@@ -0,0 +1 @@
+__author__ = 'vrama91'
diff --git a/Evaluation/rouge_metric/rouge.py b/Evaluation/rouge_metric/rouge.py
new file mode 100644
index 0000000..3a10f5a
--- /dev/null
+++ b/Evaluation/rouge_metric/rouge.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# File Name : rouge.py
+#
+# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
+#
+# Creation Date : 2015-01-07 06:03
+# Author : Ramakrishna Vedantam
+
+import numpy as np
+import pdb
+
+def my_lcs(string, sub):
+    """
+    Calculates longest common subsequence for a pair of tokenized strings
+    :param string : list of str : tokens from a string split using whitespace
+    :param sub : list of str : shorter string, also split using whitespace
+    :returns: length (int): length of the longest common subsequence between the two strings
+
+    Note: my_lcs only gives the length of the longest common subsequence, not the actual LCS
+    """
+    if(len(string) < len(sub)):
+        sub, string = string, sub
+
+    lengths = [[0 for i in range(0, len(sub)+1)] for j in range(0, len(string)+1)]
+
+    for j in range(1, len(sub)+1):
+        for i in range(1, len(string)+1):
+            if(string[i-1] == sub[j-1]):
+                lengths[i][j] = lengths[i-1][j-1] + 1
+            else:
+                lengths[i][j] = max(lengths[i-1][j], lengths[i][j-1])
+
+    return lengths[len(string)][len(sub)]
+
+class Rouge():
+    '''
+    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
+
+    '''
+    def __init__(self):
+        # vrama91: updated the value below based on discussion with Hovy
+        self.beta = 1.2
+
+    def calc_score(self, candidate, refs):
+        """
+        Compute ROUGE-L score given one candidate and references for an image
+        :param candidate: list of str : single candidate sentence to be evaluated (one-element list)
+        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
+        :returns score: float (ROUGE-L score for the candidate evaluated against references)
+        """
+        assert(len(candidate) == 1)
+        assert(len(refs) > 0)
+        prec = []
+        rec = []
+
+        # split into tokens
+        token_c = candidate[0].split(" ")
+
+        for reference in refs:
+            # split into tokens
+            token_r = reference.split(" ")
+            # compute the longest common subsequence
+            lcs = my_lcs(token_r, token_c)
+            prec.append(lcs / float(len(token_c)))
+            rec.append(lcs / float(len(token_r)))
+
+        prec_max = max(prec)
+        rec_max = max(rec)
+
+        if(prec_max != 0 and rec_max != 0):
+            score = ((1 + self.beta**2) * prec_max * rec_max) / float(rec_max + self.beta**2 * prec_max)
+        else:
+            score = 0.0
+        return score
+
+    def compute_score(self, gts, res):
+        """
+        Computes ROUGE-L score given a set of reference and candidate sentences for the dataset
+        Invoked by evaluate_captions.py
+        :param gts: dict : reference MS-COCO sentences, keyed by image id, with lists of tokenized sentences as values
+        :param res: dict : candidate / test sentences, keyed by image id, with single-element lists of tokenized sentences as values
+        :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
+        """
+        assert(gts.keys() == res.keys())
+        imgIds = gts.keys()
+
+        score = []
+        for id in imgIds:
+            hypo = res[id]
+            ref = gts[id]
+
+            score.append(self.calc_score(hypo, ref))
+
+            # Sanity check.
+            assert(type(hypo) is list)
+            assert(len(hypo) == 1)
+            assert(type(ref) is list)
+            assert(len(ref) > 0)
+
+        average_score = np.mean(np.array(score))
+        return average_score, np.array(score)
+
+    def method(self):
+        return "Rouge"
diff --git a/Leaderboard Results/GUMNLGEN09042019.txt b/Leaderboard Results/GUMNLGEN09042019.txt
new file mode 100644
index 0000000..75fb16c
--- /dev/null
+++ b/Leaderboard Results/GUMNLGEN09042019.txt
@@ -0,0 +1,22 @@
+QnA Metrics
+{'testlen': 595574, 'reflen': 965609, 'guess': [595574, 539985, 484398, 428846], 'correct': [262038, 105860, 62640, 41489]}
+ratio: 0.6167858833130173
+############################
+F1: 0.7095818893165093
+bleu_1: 0.23637297986544653
+bleu_2: 0.15778243503336944
+bleu_3: 0.12003613413452242
+bleu_4: 0.09737202953134436
+rouge_l: 0.30710908155907457
+############################
+NLGEN Metrics
+{'testlen': 179238, 'reflen': 185678, 'guess': [179238, 162761, 146284, 129817], 'correct': [81382, 44263, 28435, 18526]}
+ratio: 0.9653163002617383
+############################
+F1: 1.0
+bleu_1: 0.43802018103794066
+bleu_2: 0.3389926123404134
+bleu_3: 0.278277167790246
+bleu_4: 0.23338324092925958
+rouge_l: 0.37453516033239237
+############################
diff --git a/Leaderboard Results/ranking+nlg08122019.txt b/Leaderboard Results/ranking+nlg08122019.txt
new file mode 100644
index 0000000..acc5a9e
--- /dev/null
+++ b/Leaderboard Results/ranking+nlg08122019.txt
@@ -0,0 +1,22 @@
+QnA Metrics
+{'testlen': 775580, 'reflen': 966391, 'guess': [775580, 719991, 664402, 608813], 'correct': [407563, 250275, 193605, 159369]}
+ratio: 0.802553003908355
+############################
+F1: 0.7095818893165093
+bleu_1: 0.4108860984489985
+bleu_2: 0.3341816312622258
+bleu_3: 0.29412686598173166
+bleu_4: 0.26863916497393286
+rouge_l: 0.4339947235728949
+############################
+NLGEN Metrics
+{'testlen': 229910, 'reflen': 188685, 'guess': [229910, 213433, 196956, 180479], 'correct': [106372, 69349, 51811, 40179]}
+ratio: 1.2184858361819901
+############################
+F1: 1.0
+bleu_1: 0.4626680005219414
+bleu_2: 0.3877252179823093
+bleu_3: 0.3406959353239066
+bleu_4: 0.30631504351834743
+rouge_l: 0.43941356310975876
+############################
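
For reference, a minimal usage sketch of the Rouge scorer added by this diff (not part of the commit; the toy captions and the import path are illustrative and assume the Evaluation directory is importable as a package):

# usage_sketch.py -- illustrative only, not included in this diff
from Evaluation.rouge_metric.rouge import Rouge

# compute_score expects two dicts keyed by the same ids: each res value is a
# single-element list holding one tokenized candidate sentence, and each gts
# value is a non-empty list of tokenized reference sentences.
res = {"img1": ["a cat sits on the mat"]}
gts = {"img1": ["the cat sat on the mat", "a cat is sitting on a mat"]}

average_score, per_image_scores = Rouge().compute_score(gts, res)
print("ROUGE-L:", average_score)

Internally, calc_score takes the maximum LCS-based precision and recall over the references and combines them with beta = 1.2 as ((1 + beta**2) * P * R) / (R + beta**2 * P).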