# NeuronBlocks/metrics/conlleval.py
# (317 lines, 13 KiB, Python)

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from __future__ import division, print_function, unicode_literals
import argparse
import sys
import re
from collections import defaultdict
import logging
SENT_BOUNDARY = "-X-"  # first-column marker of a sentence-boundary row


def to_conll_format(y_true, y_pred):
    """ Transform gold answers and predictions to the conll row format.

    Sentences are paired with zip, and so are the tags inside each pair of
    sentences, so any surplus items on the longer side are ignored.

    Side effect: every call overwrites the file "debug.txt" in the current
    working directory with a text dump of the generated rows.

    Args:
        y_true: 2d array, [number of sentence, variable_sentence_length]
        y_pred: 2d array, [number of sentence, variable_sentence_length]
    Returns:
        2d array in conll output format. [number of tokens + number of sentence, 3]
        e.g. [
            [placeholder, y_true, y_pred],
            ...,
            [SENT_BOUNDARY, 'O', 'O'],      # as sentence boundary
            [placeholder, y_true, y_pred],  # next sentence
            ...
        ]
        where placeholder is a fixed string standing in for the token text.
    """
    import codecs

    result_conll = []
    # the context manager closes the debug file even if iterating the
    # inputs raises part-way through
    with codecs.open("debug.txt", 'w', encoding='utf-8') as fout:
        for target_sent, pred_sent in zip(y_true, y_pred):
            for target, pred in zip(target_sent, pred_sent):
                result_conll.append(['```', target, pred])
                fout.write("```` %s %s\n" % (target, pred))
            # one boundary row closes every sentence
            result_conll.append([SENT_BOUNDARY, "O", "O"])
            fout.write("%s %s %s\n" % (SENT_BOUNDARY, "O", "O"))
    return result_conll
# sanity check
def parse_args():
    """
    Parse the command-line options of the evaluation script.

    Returns:
        argparse.Namespace with the attributes file_path (str or None),
        latex (bool), raw (bool), delimiter (str or None) and oTag (str).
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-f", "--file_path",
        help="input file path"
    )
    argparser.add_argument(
        "-l", "--latex",
        default=False, action="store_true",
        help="generate LaTeX output"
    )
    argparser.add_argument(
        "-r", "--raw",
        default=False, action="store_true",
        help="accept raw result tags"
    )
    argparser.add_argument(
        "-d", "--delimiter",
        default=None,
        # None makes str.split break on any run of whitespace
        help="alternative delimiter tag (default: any whitespace)"
    )
    argparser.add_argument(
        "-o", "--oTag",
        default="O",
        help="alternative outside tag (default: O)"
    )
    return argparser.parse_args()
"""
• IOB1: I is a token inside a chunk, O is a token outside a chunk and B is the
beginning of chunk immediately following another chunk of the same Named Entity.
• IOB2: The same as IOB1, except that a B tag is given to every token at
the beginning of a chunk.
• IOE1: An E tag used to mark the last token of a chunk immediately preceding another
chunk of the same named entity.
• IOE2: The same as IOE1, except that an E tag is given to every token at
the end of a chunk.
• START/END: This consists of the tags B, E, I, S or O where S is used to represent a
chunk containing a single token. Chunks of length greater than or equal to two always
start with the B tag and end with the E tag.
• IO: Here, only the I and O labels are used. This therefore cannot distinguish between
adjacent chunks of the same named entity.
"""
# endOfChunk: decides whether a chunk was closed between two adjacent words
# arguments: previous and current chunk tags, previous and current types
# note: this code is capable of handling other chunk representations
# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
def endOfChunk(prevTag, tag, prevType, type):
    """
    Tell whether a chunk ended between the previous and the current word.

    Args:
        prevTag: chunk tag of the previous word (e.g. "B", "I", "E", "O")
        tag: chunk tag of the current word
        prevType: chunk type of the previous word
        type: chunk type of the current word
    Returns:
        True if the previous word was the last word of a chunk, else False
    """
    # tag transitions that close a chunk whatever the chunk types are
    closingTransitions = (
        ("B", "B"), ("B", "O"),
        ("I", "B"), ("I", "O"),
        ("E", "E"), ("E", "I"), ("E", "O"),
    )
    if (prevTag, tag) in closingTransitions:
        return True
    # a type change closes the chunk the previous word was part of
    if prevTag not in ("O", ".") and prevType != type:
        return True
    # corrected 1998-12-22: these chunks are assumed to have length 1
    return prevTag in ("]", "[")
# startOfChunk: decides whether a chunk was opened between two adjacent words
# arguments: previous and current chunk tags, previous and current types
# note: this code is capable of handling other chunk representations
# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
def startOfChunk(prevTag, tag, prevType, type):
    """
    Tell whether a chunk started between the previous and the current word.

    Args:
        prevTag: chunk tag of the previous word (e.g. "B", "I", "E", "O")
        tag: chunk tag of the current word
        prevType: chunk type of the previous word
        type: chunk type of the current word
    Returns:
        True if the current word is the first word of a chunk, else False
    """
    # tag transitions that open a chunk whatever the chunk types are
    openingTransitions = (
        ("B", "B"), ("I", "B"), ("O", "B"), ("O", "I"),
        ("E", "E"), ("E", "I"), ("O", "E"),
    )
    if (prevTag, tag) in openingTransitions:
        return True
    # a type change opens a new chunk unless the current word is outside
    if tag not in ("O", ".") and prevType != type:
        return True
    # corrected 1998-12-22: these chunks are assumed to have length 1
    return tag in ("]", "[")
def calcMetrics(TP, P, T, percent=True):
    """
    Derive precision, recall and FB1 from raw counts.

    Args:
        TP: number of correctly identified chunks
        P: number of predicted chunks (precision denominator)
        T: number of gold chunks (recall denominator)
        percent: when True every value is multiplied by 100
    Returns:
        (precision, recall, FB1); a value whose denominator is zero is 0
    """
    precision = 0
    if P:
        precision = TP / P

    recall = 0
    if T:
        recall = TP / T

    FB1 = 0
    denominator = precision + recall
    if denominator:
        FB1 = 2 * precision * recall / denominator

    if not percent:
        return precision, recall, FB1
    return 100 * precision, 100 * recall, 100 * FB1
def splitTag(chunkTag, oTag="O", raw=False, defaultType="NONE"):
    """
    Break a chunk tag into its IOB tag and its chunk type.

    Args:
        chunkTag: tag string such as "B-PER", "I-LOC" or "O"
        oTag: additional tag that is treated like the outside tag "O"
        raw: when True a non-outside tag is used whole as the chunk type
            and paired with the tag "B"
        defaultType: chunk type reported when the tag carries none
    Returns:
        (iob_tag, chunk_type)
    """
    if chunkTag in ("O", oTag):
        return "O", defaultType
    if raw:
        return "B", chunkTag

    # cut at the first hyphen only, so the type may contain hyphens itself
    prefix, hyphen, chunkType = chunkTag.partition('-')
    if not hyphen:
        # no hyphen at all: the whole string is the tag
        return chunkTag, defaultType
    return prefix, chunkType
def countChunks(fileIterator, delimiter=None, raw=False, oTag="O"):
    """
    Process input in given format and count chunks using the last two columns.

    Args:
        fileIterator: either an input stream/stdin, or a list. A str item is
            stripped and split on `delimiter`; any other item is used as the
            list of columns as-is (online evaluation). An empty item, or one
            whose first column is SENT_BOUNDARY, is a sentence boundary;
            every other item needs at least 3 columns. The second-to-last
            column is the gold tag, the last column is the predicted tag.
            list example:
            [
                ['', y_true, y_pred],
                ...,
                [SENT_BOUNDARY, 'O', 'O'],  # as sentence boundary
                ['', y_true, y_pred],       # next sentence
                ...
            ]
        delimiter: separator passed to str.split (None: any whitespace)
        raw: forwarded to splitTag
        oTag: outside tag, forwarded to splitTag and used for boundary rows
    Returns:
        (correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter);
        the first three are defaultdict(int) keyed by chunk type, the last
        two are ints.
    Raises:
        IOError: if a non-boundary line has fewer than 3 columns
    """
    correctChunk = defaultdict(int)     # number of correctly identified chunks
    foundCorrect = defaultdict(int)     # number of chunks in corpus per type
    foundGuessed = defaultdict(int)     # number of identified chunks per type

    tokenCounter = 0    # token counter (ignores sentence breaks)
    correctTags = 0     # number of correct chunk tags

    inCorrect = False   # currently processed chunk is correct until now
    lastCorrect, lastCorrectType = "O", None    # previous chunk tag in corpus
    lastGuessed, lastGuessedType = "O", None    # previously identified chunk tag

    for line in fileIterator:
        # each non-empty line must contain >= 3 columns
        if isinstance(line, str):
            stripped = line.strip()
            # "".split(sep) returns [''] for an explicit separator, so a
            # blank line has to be mapped to "no columns" by hand; otherwise
            # it would be rejected below instead of ending the sentence
            features = stripped.split(delimiter) if stripped else []
        else:   # support online evaluation
            features = line

        if not features or features[0] == SENT_BOUNDARY:    # insert a sentence boundary
            features = [SENT_BOUNDARY, oTag, oTag]
        elif len(features) < 3:
            raise IOError("conlleval: unexpected number of features in line %s\n" % line)

        # extract tags from last 2 columns
        guessed, guessedType = splitTag(features[-1], oTag=oTag, raw=raw)
        correct, correctType = splitTag(features[-2], oTag=oTag, raw=raw)

        # 1999-06-26 sentence breaks should always be counted as out of chunk
        firstItem = features[0]
        if firstItem == SENT_BOUNDARY:
            guessed, guessedType = "O", None

        # decide whether current chunk is correct until now
        if inCorrect:
            endOfCorrect = endOfChunk(lastCorrect, correct, lastCorrectType, correctType)
            endOfGuessed = endOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
            if endOfCorrect and endOfGuessed and lastGuessedType == lastCorrectType:
                # both chunks closed at the same word: one more exact match
                inCorrect = False
                correctChunk[lastCorrectType] += 1
            elif endOfGuessed != endOfCorrect or guessedType != correctType:
                # the boundaries or the types diverged: no longer a match
                inCorrect = False

        startOfGuessed = startOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
        startOfCorrect = startOfChunk(lastCorrect, correct, lastCorrectType, correctType)
        if startOfCorrect and startOfGuessed and guessedType == correctType:
            inCorrect = True
        if startOfCorrect:
            foundCorrect[correctType] += 1
        if startOfGuessed:
            foundGuessed[guessedType] += 1

        if firstItem != SENT_BOUNDARY:
            if correct == guessed and guessedType == correctType:
                correctTags += 1
            tokenCounter += 1

        lastGuessed, lastGuessedType = guessed, guessedType
        lastCorrect, lastCorrectType = correct, correctType

    # a matching chunk that is still open when the input ends counts as well
    if inCorrect:
        correctChunk[lastCorrectType] += 1

    return correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter
def evaluate(correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter, latex=False):
    """
    Log the overall and the per-type scores and return the overall ones.

    Args:
        correctChunk: mapping chunk type -> number of correctly identified chunks
        foundGuessed: mapping chunk type -> number of predicted chunks
        foundCorrect: mapping chunk type -> number of gold chunks
        correctTags: number of tokens whose tag and type were both predicted right
        tokenCounter: number of tokens processed
        latex: when True the report is logged as LaTeX table rows, otherwise
            as plain text
    Returns:
        (overall_precision, overall_recall, overall_FB1), in percent
    """
    # sum counts
    correctChunkSum = sum(correctChunk.values())
    foundGuessedSum = sum(foundGuessed.values())
    foundCorrectSum = sum(foundCorrect.values())

    # sort chunk type names
    sortedTypes = sorted(set(foundCorrect) | set(foundGuessed))

    # compute overall precision, recall and FB1 (default values are 0.0)
    overall_precision, overall_recall, overall_FB1 = calcMetrics(
        correctChunkSum, foundGuessedSum, foundCorrectSum)

    # print overall performance, and performance per chunk type
    if not latex:
        logging.info("processed %i tokens with %i phrases; found: %i phrases; correct: %i.",
                     tokenCounter, foundCorrectSum, foundGuessedSum, correctChunkSum)
        if tokenCounter:
            logging.info("accuracy: %6.2f%%; precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f",
                         100 * correctTags / tokenCounter,
                         overall_precision, overall_recall, overall_FB1)

        for i in sortedTypes:
            # .get keeps plain dicts usable and leaves the inputs unmodified
            guessedCount = foundGuessed.get(i, 0)
            precision, recall, FB1 = calcMetrics(
                correctChunk.get(i, 0), guessedCount, foundCorrect.get(i, 0))
            logging.info("%17s: precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f %d",
                         i, precision, recall, FB1, guessedCount)

    # generate LaTeX output for tables like in
    # http://cnts.uia.ac.be/conll2003/ner/example.tex
    else:
        # the subscript must sit inside $...$; the former "\$" was an invalid
        # Python escape that emitted a literal \$ and never entered math mode
        rows = [" & Precision & Recall & F$_{\\beta=1}$ \\\\\\hline"]
        for i in sortedTypes:
            precision, recall, FB1 = calcMetrics(
                correctChunk.get(i, 0), foundGuessed.get(i, 0), foundCorrect.get(i, 0))
            rows.append("%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\" % (i, precision, recall, FB1))
        logging.info("\n".join(rows))
        logging.info("\\hline")
        logging.info("Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline",
                     overall_precision, overall_recall, overall_FB1)

    return overall_precision, overall_recall, overall_FB1
if __name__ == "__main__":
    # script entry point: read tagged lines, count the chunks, log the report
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    args = parse_args()

    # process input and count chunks (named file if given, otherwise stdin)
    if args.file_path:
        with open(args.file_path) as fin:
            counts = countChunks(fin, args.delimiter, args.raw, args.oTag)
    else:
        counts = countChunks(sys.stdin, args.delimiter, args.raw, args.oTag)

    # compute metrics and print
    evaluate(*counts, latex=args.latex)
    sys.exit(0)