chore(research): Renames the harness research project to lm_eval_harness.

Gustavo Rosa 2023-01-24 16:02:58 -03:00
Parent 81ffdd907b
Commit b0a3436e04
21 changed files with 103 additions and 43 deletions

View file

View file

@@ -0,0 +1,3 @@
# HELM-Harness
## Installation

View file

View file

@@ -6,10 +6,10 @@ from setuptools import find_packages, setup
install_requires = [r.rstrip() for r in open("requirements.txt", "r").readlines()]
setup(
name="harness",
version="0.1",
name="helm_harness",
version="0.0.1",
author="Microsoft",
url="https://github.com/microsoft/archai/research/harness",
url="https://github.com/microsoft/archai/research/helm_harness",
license="MIT",
install_requires=install_requires,
packages=find_packages(),

View file

View file

@@ -1,19 +1,19 @@
# Harness
# LM-Eval-Harness
## Installation
To install Harness, run the following commands in your command line:
To install `lm_eval_harness`, run the following commands in your command line:
```shell
conda create -n harness python=3.8
conda activate harness
conda create -n lm_eval_harness python=3.8
conda activate lm_eval_harness
pip install -e .
```
## Evaluating with LM-Evaluation-Harness (lm-eval)
## Evaluating with `lm_eval_harness`
To evaluate your model with LM-Evaluation-Harness, run the following command:
To evaluate your model with `lm_eval_harness`, run the following command:
```shell
python evaluate_with_lm_eval.py --help

View file

@@ -4,14 +4,13 @@
import argparse
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from harness.lm_eval_evaluator import evaluate_wrapper
from harness.lm_eval_hf_model import HFEvalModel
from harness.tasks.human_eval import HumanEval
from harness.utils.regex import MultiChoice, pattern_match
from lm_eval.evaluator import make_table
from lm_eval.tasks import ALL_TASKS, TASK_REGISTRY
from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper
from lm_eval_harness.lm_eval_hf_model import HFEvalModel
from lm_eval_harness.tasks.human_eval import HumanEval
from lm_eval_harness.utils.regex import MultiChoice, pattern_match
from transformers import AutoModelForCausalLM, AutoTokenizer
# Ensures additional tasks are loaded and registered
ALL_TASKS.append("human_eval")
@@ -112,7 +111,7 @@ if __name__ == "__main__":
model = AutoModelForCausalLM.from_pretrained(args.pre_trained_model_path)
tokenizer = AutoTokenizer.from_pretrained(args.hub_tokenizer_path)
hf_model = HFEvalModel(model, tokenizer)
outputs = evaluate_wrapper(
hf_model,
task_names,

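For orientation, the renamed imports in this script compose as follows. This is a hedged sketch based on the truncated call above and on the notebooks later in this commit, not the script itself; the trailing arguments to `evaluate_wrapper` that are cut off here are omitted and assumed to have defaults.

```python
# Sketch: programmatic path that evaluate_with_lm_eval.py wraps (illustration only).
# Assumes evaluate_wrapper(hf_model, task_names) works with its remaining
# parameters left at their defaults, since the full call is truncated above.
from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_eval.evaluator import make_table
from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper
from lm_eval_harness.lm_eval_hf_model import HFEvalModel

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_model = HFEvalModel(model, tokenizer)

task_names = ["wsc"]  # hypothetical task selection
outputs = evaluate_wrapper(hf_model, task_names)
print(make_table(outputs))
```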
View file

@@ -2,15 +2,15 @@
# Licensed under the MIT license.
import random
from typing import List, Optional
from hashlib import sha1
from typing import List, Optional
import numpy as np
from harness.lm_eval_hf_model import HFEvalModel
from lm_eval.base import CachingLM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict
from lm_eval.utils import run_task_tests
from lm_eval_harness.lm_eval_hf_model import HFEvalModel
def evaluate_wrapper(

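The body of `evaluate_wrapper` falls outside the hunk shown above. Purely as orientation, a wrapper over these particular `lm_eval` imports typically composes them along the following lines; this is an assumption for illustration, not the project's actual implementation.

```python
# Assumed composition of the lm_eval pieces imported above (not the real body).
from lm_eval.base import CachingLM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict
from lm_eval.utils import run_task_tests


def sketch_evaluate_wrapper(lm, task_names, num_fewshot=0, use_cache=False, check_integrity=False):
    # Optionally wrap the model so repeated requests hit an on-disk cache.
    if use_cache:
        lm = CachingLM(lm, "lm_cache/model.db")  # hypothetical cache path
    # Optionally run lm_eval's per-task unit tests before evaluating.
    if check_integrity:
        run_task_tests(task_list=task_names)
    # Resolve task names into Task instances and run the standard evaluation loop.
    task_dict = get_task_dict(task_names)
    return evaluate(lm=lm, task_dict=task_dict, num_fewshot=num_fewshot, limit=None)
```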
View file

@@ -4,16 +4,20 @@
from typing import List, Optional
import torch
from harness.utils.multiple_token_stopping_criteria import MultipleTokenStoppingCriteria
from harness.utils.request_factory import Request
from lm_eval.base import BaseLM
from lm_eval_harness.utils.multiple_token_stopping_criteria import (
MultipleTokenStoppingCriteria,
)
from lm_eval_harness.utils.request_factory import Request
from tqdm import tqdm
from transformers.generation.stopping_criteria import StoppingCriteriaList
from transformers.tokenization_utils import PreTrainedTokenizer
class HFEvalModel(BaseLM):
def __init__(self, model: torch.nn.Module, tokenizer: PreTrainedTokenizer, force_attention_mask: Optional[bool] = False) -> None:
def __init__(
self, model: torch.nn.Module, tokenizer: PreTrainedTokenizer, force_attention_mask: Optional[bool] = False
) -> None:
super().__init__()
self._device = torch.device("cpu")

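Usage of the wrapper is unchanged by the reformatted constructor above; a minimal sketch follows (what `force_attention_mask` actually does is inferred from its name and should be treated as an assumption).

```python
# Minimal usage sketch for the renamed HFEvalModel wrapper (illustration only).
from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_eval_harness.lm_eval_hf_model import HFEvalModel

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# force_attention_mask defaults to False; passing True presumably makes the
# wrapper build explicit attention masks when it batches padded prompts.
hf_model = HFEvalModel(model, tokenizer, force_attention_mask=True)
```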
View file

@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

View file

@@ -6,9 +6,9 @@ from typing import Any, Dict, List
from datasets.arrow_dataset import Dataset
from evaluate import load
from harness.utils.request_factory import Request, rf
from lm_eval.base import Task
from lm_eval.metrics import mean
from lm_eval_harness.utils.request_factory import Request, rf
# Allow code evaluation
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

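Only the imports and the `HF_ALLOW_CODE_EVAL` switch of the task module appear in this hunk. For context, `evaluate_with_lm_eval.py` above appends "human_eval" to `ALL_TASKS`; wiring the class into `TASK_REGISTRY` as sketched below is an assumption, since the registry is imported there but the assignment itself is not shown in these hunks.

```python
# Sketch: exposing the custom HumanEval task to lm_eval's task lookup.
# ALL_TASKS.append mirrors evaluate_with_lm_eval.py above; the TASK_REGISTRY
# assignment is assumed, since only the import of TASK_REGISTRY is shown.
from lm_eval.tasks import ALL_TASKS, TASK_REGISTRY
from lm_eval_harness.tasks.human_eval import HumanEval

ALL_TASKS.append("human_eval")
TASK_REGISTRY["human_eval"] = HumanEval
```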
View file

@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

View file

@@ -0,0 +1,17 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from setuptools import find_packages, setup
install_requires = [r.rstrip() for r in open("requirements.txt", "r").readlines()]
setup(
name="lm_eval_harness",
version="0.0.1",
author="Microsoft",
url="https://github.com/microsoft/archai/research/lm_eval_harness",
license="MIT",
install_requires=install_requires,
packages=find_packages(),
include_package_data=True,
)

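After `pip install -e .`, the renamed distribution can be sanity-checked from Python; a small sketch (the distribution name is taken from the `name` field above):

```python
# Verify the editable install of the renamed package (illustration only).
from importlib.metadata import version

print(version("lm_eval_harness"))  # expected to print "0.0.1" per the setup() above
```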
View file

@@ -5,9 +5,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# How-To Evaluate a Custom Task with LM-Eval\n",
"# How-To Evaluate a Custom Task with LM-Eval Harness\n",
"\n",
"Even though `lm-eval` framework supports more than 200 tasks, one might want to implement an additional one. With that in mind, this tutorial walks through the process of creating a custom task, including it in the registry and evaluating models with it."
"Even though `lm_eval` framework supports more than 200 tasks, one might want to implement an additional one. With that in mind, this tutorial walks through the process of creating a custom task, including it in the registry and evaluating models with it."
]
},
{
@@ -17,7 +17,7 @@
"source": [
"## Installation\n",
"\n",
"The `harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
"The `lm_eval_harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
]
},
{
@@ -27,9 +27,9 @@
"outputs": [],
"source": [
"try:\n",
" import harness\n",
" import lm_eval_harness\n",
"except ModuleNotFoundError:\n",
" !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/harness"
" !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness"
]
},
{
@@ -75,7 +75,7 @@
"from typing import Any, Dict, List\n",
"\n",
"from datasets.arrow_dataset import Dataset\n",
"from harness.utils.request_factory import Request, rf\n",
"from lm_eval_harness.utils.request_factory import Request, rf\n",
"from lm_eval.base import Task\n",
"from lm_eval.metrics import mean\n",
"\n",
@@ -222,8 +222,8 @@
"\n",
"from lm_eval.evaluator import make_table\n",
"\n",
"from harness.lm_eval_evaluator import evaluate_wrapper\n",
"from harness.lm_eval_hf_model import HFEvalModel\n",
"from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper\n",
"from lm_eval_harness.lm_eval_hf_model import HFEvalModel\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",

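The notebook cell that actually defines the custom task is unchanged by this commit, so it does not appear in the hunks above. For readers skimming the diff, a minimal task built from exactly the imports shown (`Task`, `mean`, and the relocated `rf`) would look roughly like the sketch below; it assumes the `lm_eval` 0.x `Task` interface and its usual convention that each loglikelihood request resolves to a `(loglikelihood, is_greedy)` pair, and is not the tutorial's own code.

```python
# Hypothetical minimal yes/no task, assuming the lm_eval 0.x Task interface.
from lm_eval.base import Task
from lm_eval.metrics import mean
from lm_eval_harness.utils.request_factory import rf


class YesNoTask(Task):
    VERSION = 0

    def download(self, data_dir=None, cache_dir=None, download_mode=None):
        # No external dataset: the docs below are defined in memory.
        pass

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return []

    def validation_docs(self):
        return [{"question": "Is the sky blue?", "label": 1}]

    def test_docs(self):
        return []

    def doc_to_text(self, doc):
        return f"Question: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
        return " yes" if doc["label"] else " no"

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate answer.
        return [rf.loglikelihood(ctx, " yes"), rf.loglikelihood(ctx, " no")]

    def process_results(self, doc, results):
        (ll_yes, _), (ll_no, _) = results
        pred = int(ll_yes > ll_no)
        return {"acc": float(pred == doc["label"])}

    def aggregation(self):
        return {"acc": mean}

    def higher_is_better(self):
        return {"acc": True}
```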
View file

@@ -5,9 +5,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# How-To Evaluate Models with LM-Eval\n",
"# How-To Evaluate Models with LM-Eval Harness\n",
"\n",
"The `harness` research project implements a wrapper over the `lm-eval` framework, provided by EleutherAI. It is designed to make it easy to evaluate NLP models and compare their performance. In this tutorial, we will walk through the process of evaluating NLP models with `harness`, including how to set up the framework, how to use it to evaluate models, and how to interpret the results."
"The `lm_eval_harness` research project implements a wrapper over the `lm_eval` framework, provided by EleutherAI. It is designed to make it easy to evaluate NLP models and compare their performance. In this tutorial, we will walk through the process of evaluating NLP models with `lm_eval_harness`, including how to set up the framework, how to use it to evaluate models, and how to interpret the results."
]
},
{
@@ -17,19 +17,40 @@
"source": [
"## Installation\n",
"\n",
"The `harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
"The `lm_eval_harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness\n",
" Cloning https://github.com/microsoft/archai.git (to revision pre-release) to c:\\users\\gderosa\\appdata\\local\\temp\\pip-req-build-2q9113pq\n",
" Resolved https://github.com/microsoft/archai.git to commit 81ffdd907b9485e3663f1ddbf32e2f862a65f4fe\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" Running command git clone --filter=blob:none --quiet https://github.com/microsoft/archai.git 'C:\\Users\\gderosa\\AppData\\Local\\Temp\\pip-req-build-2q9113pq'\n",
" Running command git checkout -b pre-release --track origin/pre-release\n",
" Branch 'pre-release' set up to track remote branch 'pre-release' from 'origin'.\n",
" Switched to a new branch 'pre-release'\n",
"ERROR: git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.\n"
]
}
],
"source": [
"try:\n",
" import harness\n",
" import lm_eval_harness\n",
"except ModuleNotFoundError:\n",
" !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/harness"
" !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness"
]
},
{
@@ -46,12 +67,24 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'lm_eval_harness'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_388\\1302793399.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mAutoModelForCausalLM\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mlm_eval_harness\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlm_eval_hf_model\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mHFEvalModel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoModelForCausalLM\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"gpt2\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"gpt2\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'lm_eval_harness'"
]
}
],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from harness.lm_eval_hf_model import HFEvalModel\n",
"from lm_eval_harness.lm_eval_hf_model import HFEvalModel\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
@@ -83,7 +116,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -138,7 +171,7 @@
],
"source": [
"from lm_eval.tasks import ALL_TASKS\n",
"from harness.lm_eval_evaluator import evaluate_wrapper\n",
"from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper\n",
"\n",
"print(f\"List of tasks: {ALL_TASKS}\")\n",
"\n",
@@ -162,7 +195,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{