doc: add lisa case training data.

It generates a JSONL file for LLM training. The input is the test case descriptions of a test file; the output is that test file's content.
Chi Song (from Dev Box) 2024-09-27 13:34:37 -07:00, committed by LiliDeng
Parent cb416678a9
Commit a768712167
3 changed files with 92 additions and 2 deletions
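Each line of the generated JSONL file is one chat-format training record: a fixed system prompt, a user turn carrying the extracted test descriptions, and an assistant turn carrying the test file's source. As an illustrative sketch (the record shape is taken from the code below; the "..." placeholders stand for the real descriptions and source, which are elided here):

record = {
    "messages": [
        {
            "role": "system",
            "content": "You are a helpful assistant to write Python code"
            " for linux validation.",
        },
        {
            "role": "user",
            "content": "Please write LISA test cases based on the test case"
            " descriptions.\n\n\ntest suite description:\n...",
        },
        # Placeholder; the real content is the full source of the test file.
        {"role": "assistant", "content": "..."},
    ]
}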

View file: docs/conf.py

@@ -20,7 +20,11 @@ root_dir = Path(__file__).parent.parent
 sys.path.insert(0, str(root_dir))
 sys.path.insert(0, str(root_dir / "docs"))
-from tools import update_file, update_summary  # type: ignore # noqa: E402
+from tools import (  # type: ignore # noqa: E402
+    update_file,
+    update_finetune_data,
+    update_summary,
+)
 
 # -- Project information -----------------------------------------------------
@@ -87,3 +91,4 @@ base_path = Path(__file__).parent
 update_summary()
 update_file()
+update_finetune_data()

View file: docs/tools/__init__.py

@@ -1,7 +1,8 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+from .finetune_gen import update_finetune_data
 from .test_spec_gen import update_file
 from .test_summary_gen import update_summary
 
-__all__ = ["update_summary", "update_file"]
+__all__ = ["update_summary", "update_file", "update_finetune_data"]

View file: docs/tools/finetune_gen.py (new file)

@@ -0,0 +1,84 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
This script generates fine-tune data for AI models.
"""

import ast
import json
import os
from pathlib import Path
from typing import Dict

from .doc_generator import TESTS, ClassVisitor, FuncVisitor, extract_metadata, load_path

base_path = Path(__file__).parent
file_path = (base_path / "../_build/html/finetune_cases.jsonl").resolve()
system_prompt = "You are a helpful assistant to write Python code for linux validation."
user_prompt = "Please write LISA test cases based on the test case descriptions.\n\n\n"


def update_finetune_data() -> None:
    """
    Updates (rewrites) fine-tune data for AI models.
    """
    data = load_path(TESTS)
    test_paths = [(base_path / Path(x.get("value", ""))).resolve() for x in data]
    file_path_parent = file_path.parent
    if not os.path.exists(file_path_parent):
        os.makedirs(file_path_parent)
    with open(file_path, "w", encoding="utf-8") as f:
        for test_path in test_paths:
            for root, _, files in os.walk(test_path):
                for file in files:
                    if file.endswith(".py"):
                        # Parse the test module and collect the suite and
                        # case descriptions from its metadata.
                        descriptions = "test suite description:\n"
                        test_name = Path(root) / file
                        tree = ast.parse(
                            test_name.read_text(encoding="utf-8"),
                            filename=str(test_name),
                        )
                        cls_visitor = ClassVisitor()
                        func_visitor = FuncVisitor()
                        cls_visitor.visit(tree)
                        func_visitor.visit(tree)
                        for suite_metadata in extract_metadata(
                            cls_visitor.get_suites()
                        ):
                            descriptions += _get_description(suite_metadata, True)
                            descriptions += "\n\n"
                        for case in extract_metadata(func_visitor.get_cases()):
                            descriptions += "test case:\n"
                            descriptions += _get_description(case)
                            descriptions += "\n\n"
                        # Pair the descriptions (user turn) with the full file
                        # source (assistant turn) as one chat-format record.
                        source = test_name.read_text(encoding="utf-8")
                        f.write(
                            json.dumps(
                                {
                                    "messages": [
                                        {"role": "system", "content": system_prompt},
                                        {
                                            "role": "user",
                                            "content": user_prompt + descriptions,
                                        },
                                        {"role": "assistant", "content": source},
                                    ]
                                }
                            )
                            + "\n"
                        )


def _get_description(metadata: Dict[str, str], is_suite: bool = False) -> str:
    # is_suite is accepted but not used in the current implementation.
    text = metadata["description"].split("\n")
    # filter out empty lines
    res = filter(lambda line: not line.isspace() and line != "", text)
    text = list(res)
    return "\n".join(text)
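
To sanity-check the generated records before using them for fine-tuning, something like the following could be run (a minimal sketch, not part of this commit; the path is assumed from file_path above, with the docs build output under docs/_build/html):

import json
from pathlib import Path

# Assumed output location, resolved the same way as file_path above.
jsonl_path = Path("docs/_build/html/finetune_cases.jsonl")
with jsonl_path.open(encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Every record should be a three-turn chat: system, user, assistant.
        roles = [m["role"] for m in record["messages"]]
        assert roles == ["system", "user", "assistant"]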