Mirror of https://github.com/microsoft/archai.git

chore(research): Renames harness to lm_eval_harness research project.

Parent: 81ffdd907b
Commit: b0a3436e04
@@ -0,0 +1,3 @@
+# HELM-Harness
+
+## Installation
@@ -6,10 +6,10 @@ from setuptools import find_packages, setup
 install_requires = [r.rstrip() for r in open("requirements.txt", "r").readlines()]
 
 setup(
-    name="harness",
-    version="0.1",
+    name="helm_harness",
+    version="0.0.1",
     author="Microsoft",
-    url="https://github.com/microsoft/archai/research/harness",
+    url="https://github.com/microsoft/archai/research/helm_harness",
     license="MIT",
     install_requires=install_requires,
     packages=find_packages(),
@@ -1,19 +1,19 @@
-# Harness
+# LM-Eval-Harness
 
 ## Installation
 
-To install Harness, run the following commands in your command line:
+To install `lm_eval_harness`, run the following commands in your command line:
 
 ```shell
-conda create -n harness python=3.8
-conda activate harness
+conda create -n lm_eval_harness python=3.8
+conda activate lm_eval_harness
 
 pip install -e .
 ```
 
-## Evaluating with LM-Evaluation-Harness (lm-eval)
+## Evaluating with `lm_eval_harness`
 
-To evaluate your model with LM-Evaluation-Harness, run the following command:
+To evaluate your model with `lm_eval_harness`, run the following command:
 
 ```shell
 python evaluate_with_lm_eval.py --help
@@ -4,14 +4,13 @@
 import argparse
 import json
 
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from harness.lm_eval_evaluator import evaluate_wrapper
-from harness.lm_eval_hf_model import HFEvalModel
-from harness.tasks.human_eval import HumanEval
-from harness.utils.regex import MultiChoice, pattern_match
 from lm_eval.evaluator import make_table
 from lm_eval.tasks import ALL_TASKS, TASK_REGISTRY
+from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper
+from lm_eval_harness.lm_eval_hf_model import HFEvalModel
+from lm_eval_harness.tasks.human_eval import HumanEval
+from lm_eval_harness.utils.regex import MultiChoice, pattern_match
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # Ensures additional tasks are loaded and registered
 ALL_TASKS.append("human_eval")
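The two trailing lines of this hunk are what make the custom `HumanEval` task visible to `lm_eval`. A minimal, hedged sketch of the full registration pattern the script relies on; the explicit `TASK_REGISTRY` entry is an assumption about what importing the task module does, since it is not shown in this diff:

```python
# Hedged sketch of custom-task registration; the TASK_REGISTRY line is an
# assumption -- this diff only shows ALL_TASKS.append("human_eval").
from lm_eval.tasks import ALL_TASKS, TASK_REGISTRY, get_task_dict
from lm_eval_harness.tasks.human_eval import HumanEval

TASK_REGISTRY["human_eval"] = HumanEval  # assumed side effect of the task module
if "human_eval" not in ALL_TASKS:
    ALL_TASKS.append("human_eval")

task_dict = get_task_dict(["human_eval"])  # now resolvable like any built-in task
```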
@@ -112,7 +111,7 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(args.pre_trained_model_path)
     tokenizer = AutoTokenizer.from_pretrained(args.hub_tokenizer_path)
     hf_model = HFEvalModel(model, tokenizer)
-
+
     outputs = evaluate_wrapper(
         hf_model,
         task_names,
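Pieced together from the call site above and the notebook cells later in this diff, a minimal end-to-end sketch of running an evaluation; `"gpt2"` and the single-task list are illustrative choices, not mandated by the script:

```python
# Assembled from imports and call sites visible in this diff; "gpt2" and the
# task list are illustrative, and evaluate_wrapper's extra kwargs are omitted.
from transformers import AutoModelForCausalLM, AutoTokenizer

from lm_eval.evaluator import make_table
from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper
from lm_eval_harness.lm_eval_hf_model import HFEvalModel

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_model = HFEvalModel(model, tokenizer)

outputs = evaluate_wrapper(hf_model, ["human_eval"])
print(make_table(outputs))  # same reporting helper the script imports
```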
@@ -2,15 +2,15 @@
 # Licensed under the MIT license.
 
 import random
-from typing import List, Optional
 from hashlib import sha1
+from typing import List, Optional
 
 import numpy as np
-from harness.lm_eval_hf_model import HFEvalModel
 from lm_eval.base import CachingLM
 from lm_eval.evaluator import evaluate
 from lm_eval.tasks import get_task_dict
 from lm_eval.utils import run_task_tests
+from lm_eval_harness.lm_eval_hf_model import HFEvalModel
 
 
 def evaluate_wrapper(
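Only the opening of `evaluate_wrapper`'s signature appears in this hunk. Based on the imports above, a hedged sketch of what such a wrapper plausibly does; keyword names follow `lm_eval` 0.x, and the optional caching path is an assumption:

```python
# Hedged sketch, not the project's implementation: wire together the pieces
# this module imports, the way lm-eval's own CLI does.
from lm_eval.base import CachingLM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict


def evaluate_wrapper_sketch(lm, task_names, num_fewshot=0, cache_path=None):
    if cache_path is not None:
        lm = CachingLM(lm, cache_path)  # assumption: optional request caching
    task_dict = get_task_dict(task_names)
    return evaluate(lm=lm, task_dict=task_dict, num_fewshot=num_fewshot)
```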
@@ -4,16 +4,20 @@
 from typing import List, Optional
 
 import torch
-from harness.utils.multiple_token_stopping_criteria import MultipleTokenStoppingCriteria
-from harness.utils.request_factory import Request
 from lm_eval.base import BaseLM
+from lm_eval_harness.utils.multiple_token_stopping_criteria import (
+    MultipleTokenStoppingCriteria,
+)
+from lm_eval_harness.utils.request_factory import Request
 from tqdm import tqdm
 from transformers.generation.stopping_criteria import StoppingCriteriaList
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 
 class HFEvalModel(BaseLM):
-    def __init__(self, model: torch.nn.Module, tokenizer: PreTrainedTokenizer, force_attention_mask: Optional[bool] = False) -> None:
+    def __init__(
+        self, model: torch.nn.Module, tokenizer: PreTrainedTokenizer, force_attention_mask: Optional[bool] = False
+    ) -> None:
         super().__init__()
 
         self._device = torch.device("cpu")
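For background on the `MultipleTokenStoppingCriteria` import above: `transformers` lets generation halt on custom conditions through the `StoppingCriteria` protocol. An illustrative criterion in that style, not the project's implementation:

```python
# Illustrative only -- shows the StoppingCriteria protocol that
# MultipleTokenStoppingCriteria plugs into via StoppingCriteriaList.
import torch
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnAnyTokenId(StoppingCriteria):  # hypothetical helper, for illustration
    def __init__(self, stop_ids):
        self.stop_ids = set(stop_ids)

    def __call__(self, input_ids: torch.LongTensor, scores, **kwargs) -> bool:
        # Halt as soon as the most recently generated token is a stop id.
        return int(input_ids[0, -1]) in self.stop_ids


stopping_criteria = StoppingCriteriaList([StopOnAnyTokenId([50256])])  # 50256: GPT-2 EOS
```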
@@ -0,0 +1,2 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
@@ -6,9 +6,9 @@ from typing import Any, Dict, List
 
 from datasets.arrow_dataset import Dataset
 from evaluate import load
-from harness.utils.request_factory import Request, rf
 from lm_eval.base import Task
 from lm_eval.metrics import mean
+from lm_eval_harness.utils.request_factory import Request, rf
 
 # Allow code evaluation
 os.environ["HF_ALLOW_CODE_EVAL"] = "1"
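The environment flag set above is the opt-in required by the HuggingFace `evaluate` library before it will execute model-generated code. A self-contained example of the `code_eval` metric that flag unlocks; the toy reference/prediction pair is illustrative:

```python
# Standalone demo of the code_eval metric gated by HF_ALLOW_CODE_EVAL.
import os

os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # same opt-in the diffed module sets

from evaluate import load

code_eval = load("code_eval")
pass_at_k, _ = code_eval.compute(
    references=["assert add(1, 2) == 3"],
    predictions=[["def add(a, b):\n    return a + b"]],
    k=[1],
)
print(pass_at_k)  # -> {'pass@1': 1.0}
```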
@@ -0,0 +1,2 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
@@ -0,0 +1,17 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from setuptools import find_packages, setup
+
+install_requires = [r.rstrip() for r in open("requirements.txt", "r").readlines()]
+
+setup(
+    name="lm_eval_harness",
+    version="0.0.1",
+    author="Microsoft",
+    url="https://github.com/microsoft/archai/research/lm_eval_harness",
+    license="MIT",
+    install_requires=install_requires,
+    packages=find_packages(),
+    include_package_data=True,
+)
@@ -5,9 +5,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# How-To Evaluate a Custom Task with LM-Eval\n",
+    "# How-To Evaluate a Custom Task with LM-Eval Harness\n",
     "\n",
-    "Even though `lm-eval` framework supports more than 200 tasks, one might want to implement an additional one. With that in mind, this tutorial walks through the process of creating a custom task, including it in the registry and evaluating models with it."
+    "Even though `lm_eval` framework supports more than 200 tasks, one might want to implement an additional one. With that in mind, this tutorial walks through the process of creating a custom task, including it in the registry and evaluating models with it."
    ]
   },
   {
@@ -17,7 +17,7 @@
    "source": [
     "## Installation\n",
     "\n",
-    "The `harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
+    "The `lm_eval_harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
    ]
   },
   {
@@ -27,9 +27,9 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    import harness\n",
+    "    import lm_eval_harness\n",
     "except ModuleNotFoundError:\n",
-    "    !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/harness"
+    "    !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness"
    ]
   },
   {
@@ -75,7 +75,7 @@
     "from typing import Any, Dict, List\n",
     "\n",
     "from datasets.arrow_dataset import Dataset\n",
-    "from harness.utils.request_factory import Request, rf\n",
+    "from lm_eval_harness.utils.request_factory import Request, rf\n",
     "from lm_eval.base import Task\n",
     "from lm_eval.metrics import mean\n",
     "\n",
@@ -222,8 +222,8 @@
     "\n",
     "from lm_eval.evaluator import make_table\n",
     "\n",
-    "from harness.lm_eval_evaluator import evaluate_wrapper\n",
-    "from harness.lm_eval_hf_model import HFEvalModel\n",
+    "from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper\n",
+    "from lm_eval_harness.lm_eval_hf_model import HFEvalModel\n",
     "\n",
     "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
@@ -5,9 +5,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# How-To Evaluate Models with LM-Eval\n",
+    "# How-To Evaluate Models with LM-Eval Harness\n",
     "\n",
-    "The `harness` research project implements a wrapper over the `lm-eval` framework, provided by EleutherAI. It is designed to make it easy to evaluate NLP models and compare their performance. In this tutorial, we will walk through the process of evaluating NLP models with `harness`, including how to set up the framework, how to use it to evaluate models, and how to interpret the results."
+    "The `lm_eval_harness` research project implements a wrapper over the `lm_eval` framework, provided by EleutherAI. It is designed to make it easy to evaluate NLP models and compare their performance. In this tutorial, we will walk through the process of evaluating NLP models with `lm_eval_harness`, including how to set up the framework, how to use it to evaluate models, and how to interpret the results."
    ]
   },
   {
@@ -17,19 +17,40 @@
    "source": [
     "## Installation\n",
     "\n",
-    "The `harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
+    "The `lm_eval_harness` project is designed to be an installable module, which allow users to call it from outside its package. Thus, one can install it as follows:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness\n",
+      " Cloning https://github.com/microsoft/archai.git (to revision pre-release) to c:\\users\\gderosa\\appdata\\local\\temp\\pip-req-build-2q9113pq\n",
+      " Resolved https://github.com/microsoft/archai.git to commit 81ffdd907b9485e3663f1ddbf32e2f862a65f4fe\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " Running command git clone --filter=blob:none --quiet https://github.com/microsoft/archai.git 'C:\\Users\\gderosa\\AppData\\Local\\Temp\\pip-req-build-2q9113pq'\n",
+      " Running command git checkout -b pre-release --track origin/pre-release\n",
+      " Branch 'pre-release' set up to track remote branch 'pre-release' from 'origin'.\n",
+      " Switched to a new branch 'pre-release'\n",
+      "ERROR: git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.\n"
+     ]
+    }
+   ],
    "source": [
     "try:\n",
-    "    import harness\n",
+    "    import lm_eval_harness\n",
     "except ModuleNotFoundError:\n",
-    "    !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/harness"
+    "    !pip install git+https://github.com/microsoft/archai.git@pre-release#subdirectory=research/lm_eval_harness"
    ]
   },
   {
@@ -46,12 +67,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'lm_eval_harness'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_388\\1302793399.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mAutoModelForCausalLM\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mlm_eval_harness\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlm_eval_hf_model\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mHFEvalModel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoModelForCausalLM\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"gpt2\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"gpt2\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'lm_eval_harness'"
+     ]
+    }
+   ],
    "source": [
     "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
-    "from harness.lm_eval_hf_model import HFEvalModel\n",
+    "from lm_eval_harness.lm_eval_hf_model import HFEvalModel\n",
     "\n",
     "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
@@ -83,7 +116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -138,7 +171,7 @@
    ],
    "source": [
     "from lm_eval.tasks import ALL_TASKS\n",
-    "from harness.lm_eval_evaluator import evaluate_wrapper\n",
+    "from lm_eval_harness.lm_eval_evaluator import evaluate_wrapper\n",
     "\n",
     "print(f\"List of tasks: {ALL_TASKS}\")\n",
     "\n",
@@ -162,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {