Merge pull request #57 from dlazesz/fix_create_pretraining

Fix parse_data() invoked twice and some cosmetics
This commit is contained in:
Xiaoyong Zhu 2020-06-30 15:13:47 -07:00 коммит произвёл GitHub
Родитель 4b27fac863 6ed915a22e
Коммит 1129d54321
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
1 изменённый файл: 17 добавлений и 29 удалений

Просмотреть файл

@ -2,27 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Standard-library imports, grouped per PEP 8.
# NOTE(review): `os` and `logging` were imported twice in the original
# (a diff-rendering duplication); deduplicated here, no name removed.
import argparse
import collections
import json
import logging
import multiprocessing
import os
import pickle
import random
import shutil
import sys
import tempfile
import unicodedata
from functools import wraps
from hashlib import sha256
from multiprocessing import Pool
from pathlib import Path
from random import shuffle
from typing import Optional, Tuple, Union, IO, Callable, Set
from urllib.parse import urlparse

# Third-party
from tqdm import tqdm

# Make the parent directory importable for the project-local imports below.
sys.path.append("..")
@ -36,29 +21,31 @@ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message
logger = logging.getLogger(__name__)
def parse_data(input_file, output_file):
    """Tokenize one raw text file and pickle the resulting pretraining dataset.

    Does nothing when ``output_file`` already exists, so re-runs are cheap.
    """
    # Guard clause: skip files that were already pickled in a previous run.
    if os.path.exists(output_file):
        return
    print(input_file)
    # `tokenizer` and `GenericPretrainingDataCreator` are module-level names
    # defined elsewhere in this script.
    dataset = GenericPretrainingDataCreator(
        input_file, tokenizer, dupe_factor=9, max_seq_length=512)
    dataset.save(output_file)
    print(f"Completed Pickling: {output_file}")
def parse_data(inp_file, out_file):
    """Tokenize one raw text file and pickle the resulting pretraining dataset.

    Skips the work when ``out_file`` already exists so the script can be
    re-run without re-parsing finished files.
    """
    if not os.path.exists(out_file):
        print(inp_file)
        # `tokenizer` and `GenericPretrainingDataCreator` are module-level
        # names defined elsewhere in this script.
        dataset = GenericPretrainingDataCreator(inp_file, tokenizer, dupe_factor=9, max_seq_length=512)
        dataset.save(out_file)
        print(f"Completed Pickling: {out_file}")
    else:
        # Fixed: a leftover duplicate print referenced the old parameter name
        # `output_file`, which is undefined in this scope and raised NameError.
        print(f'Already parsed: {out_file}')
# Command-line interface.
# Fixed: the rendered diff registered --input_dir twice (old and new help
# text both present), which makes argparse raise ArgumentError at startup;
# each option is now added exactly once. Also fixed the "wheter" typo in
# the --do_lower_case help text.
parser = argparse.ArgumentParser(
    description="Give initial arguments for parsing")
parser.add_argument("--input_dir", type=str,
                    help="This folder contains .txt files of Wikipedia Data."
                         " Each .txt file contains the text from the documents."
                         " It makes an assumption that each line in the file represents"
                         " a single line in the document too. A blank line represents completion of a document.")
parser.add_argument("--output_dir", type=str, help="Path to Output Directory.")
parser.add_argument("--token_file", default="bert-large-uncased", type=str)
parser.add_argument("--do_lower_case", default=False, action="store_true",
                    help="This flag indicates whether the text should be lowercase or not")
parser.add_argument("--processes", "-p", default=0, type=int,
                    help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents"
                         " that it will use all the available cores in the CPU.")
args = parser.parse_args()
tokenizer = BertTokenizer.from_pretrained(
@ -82,7 +69,8 @@ for filename in os.listdir(args.input_dir):
# Tail of the per-file loop: collect matching input/output paths, then fan
# the parsing out across worker processes.
# Fixed: the removed serial call `parse_data(input_file, output_file)` was
# still present alongside its commented-out replacement, so every file was
# parsed twice (once serially in the loop, once via the pool); dropped it.
# NOTE(review): `outfilename`, `input_file`, `input_files`, `output_files`
# and `num_processes` are defined in lines outside this hunk.
output_file = os.path.join(args.output_dir, outfilename)
input_files.append(input_file)
output_files.append(output_file)
# parse_data(input_file, output_file) # this line is for single core processing
# parse data using multiple cores
with Pool(processes=num_processes) as pool:
    pool.starmap(parse_data, zip(input_files, output_files))