add xnli dataset utils
This commit is contained in:
Родитель
0929c37d56
Коммит
f12aabd5b0
|
@ -6,32 +6,35 @@ https://www.nyu.edu/projects/bowman/xnli/
|
|||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import requests
|
||||
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
|
||||
|
||||
URL = "https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip"
|
||||
|
||||
DATA_FILES = {"dev": "XNLI-1.0/xnli.dev.jsonl", "test": "XNLI-1.0/xnli.test.jsonl"}
|
||||
DATA_FILES = {
|
||||
"dev": "XNLI-1.0/xnli.dev.jsonl",
|
||||
"test": "XNLI-1.0/xnli.test.jsonl",
|
||||
}
|
||||
|
||||
|
||||
def load_pandas_df(local_cache_path=None, file_split="train"):
|
||||
"""Downloads and extracts the dataset files
|
||||
def load_pandas_df(local_cache_path=None, file_split="dev"):
|
||||
"""Downloads and extracts the dataset files
|
||||
Args:
|
||||
local_cache_path ([type], optional): [description]. Defaults to None.
|
||||
local_cache_path (str, optional): Local directory used to cache the downloaded dataset files.
|
||||
Defaults to None.
|
||||
file_split (str, optional): The subset to load.
|
||||
One of: {"dev", "test"}
|
||||
Defaults to "train".
|
||||
One of: {"dev", "test"}
|
||||
Defaults to "dev".
|
||||
Returns:
|
||||
pd.DataFrame: pandas DataFrame containing the specified XNLI subset.
|
||||
pd.DataFrame: pandas DataFrame containing the specified
|
||||
XNLI subset.
|
||||
"""
|
||||
|
||||
file_name = URL.split("/")[-1]
|
||||
if not os.path.exists(os.path.join(local_cache_path, file_name)):
|
||||
response = requests.get(URL)
|
||||
with open(os.path.join(local_cache_path, file_name), "wb") as f:
|
||||
f.write(response.content)
|
||||
maybe_download(URL, file_name, local_cache_path)
|
||||
|
||||
if not os.path.exists(
|
||||
os.path.join(local_cache_path, DATA_FILES[file_split])
|
||||
):
|
||||
|
@ -41,4 +44,3 @@ def load_pandas_df(local_cache_path=None, file_split="train"):
|
|||
return pd.read_json(
|
||||
os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True
|
||||
)
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче