Merge pull request #57 from dlazesz/fix_create_pretraining
Fix parse_data() invoked twice and some cosmetics
This commit is contained in:
Commit 1129d54321
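Before this change the script parsed every input file twice: once serially inside the loop that collects the file lists, and again in parallel through Pool.starmap over the same lists. The fix keeps only the parallel path and turns the serial call into a comment. A minimal sketch of the removed double invocation, assuming toy file names (the real lists are built from --input_dir):

    from multiprocessing import Pool

    def parse_data(inp_file, out_file):
        print(f"parsing {inp_file} -> {out_file}")

    if __name__ == "__main__":
        input_files = ["a.txt", "b.txt"]    # hypothetical stand-ins
        output_files = ["a.bin", "b.bin"]

        for inp, out in zip(input_files, output_files):
            parse_data(inp, out)            # first invocation: serial (dropped by this PR)

        with Pool(processes=2) as pool:     # second invocation: parallel (kept)
            pool.starmap(parse_data, zip(input_files, output_files))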
@@ -2,27 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import unicodedata
 import os
 import logging
 from multiprocessing import Pool
-import multiprocessing
-import os
-import logging
-import shutil
-import tempfile
 import argparse
-import json
-from urllib.parse import urlparse
-from pathlib import Path
-from typing import Optional, Tuple, Union, IO, Callable, Set
-from hashlib import sha256
-from functools import wraps
-import random
-from tqdm import tqdm
-from random import shuffle
-import pickle
 
+
 import sys
 sys.path.append("..")
+
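The dropped imports are presumably unused in this script; os and logging were also imported twice. Duplicate imports are harmless at runtime, since Python returns the module cached in sys.modules instead of loading it again, so removing them is purely cosmetic:

    import sys
    import os
    import os  # no-op: the interpreter returns the cached module object

    print(sys.modules["os"] is os)  # True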
@@ -36,29 +21,31 @@ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message
 logger = logging.getLogger(__name__)
 
 
-def parse_data(input_file, output_file):
-    if not os.path.exists(output_file):
-        print(input_file)
-        dataset = GenericPretrainingDataCreator(
-            input_file, tokenizer, dupe_factor=9, max_seq_length=512)
-        dataset.save(output_file)
-        print(f"Completed Pickling: {output_file}")
+def parse_data(inp_file, out_file):
+    if not os.path.exists(out_file):
+        print(inp_file)
+        dataset = GenericPretrainingDataCreator(inp_file, tokenizer, dupe_factor=9, max_seq_length=512)
+        dataset.save(out_file)
+        print(f"Completed Pickling: {out_file}")
     else:
-        print(f'Already parsed: {output_file}')
+        print(f'Already parsed: {out_file}')
 
 
 parser = argparse.ArgumentParser(
     description="Give initial arguments for parsing")
 
-parser.add_argument("--input_dir", type=str, help="This folder contains .txt files of Wikipedia Data. Each .txt file contains the text from the documents. \
-    It makes an assumption that each line in the file represents a single line in the document too.\
-    A blank line represents completion of a document.")
+parser.add_argument("--input_dir", type=str,
+                    help="This folder contains .txt files of Wikipedia Data."
+                         " Each .txt file contains the text from the documents."
+                         " It makes an assumption that each line in the file represents"
+                         " a single line in the document too. A blank line represents completion of a document.")
 parser.add_argument("--output_dir", type=str, help="Path to Output Directory.")
 parser.add_argument("--token_file", default="bert-large-uncased", type=str)
 parser.add_argument("--do_lower_case", default=False, action="store_true",
                     help="This flag indicates the wheter the text should be lowercase or not")
 parser.add_argument("--processes", "-p", default=0, type=int,
-                    help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents that it will use all the available cores in the CPU.")
+                    help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents"
+                         " that it will use all the available cores in the CPU.")
 
 args = parser.parse_args()
 tokenizer = BertTokenizer.from_pretrained(
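The rewritten help texts drop the old backslash continuations in favor of adjacent string literals, which Python concatenates at compile time into a single string. For example:

    msg = ("This folder contains .txt files of Wikipedia Data."
           " Each .txt file contains the text from the documents.")
    print(msg)  # one string; no '+' or '\' needed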
@@ -82,7 +69,8 @@ for filename in os.listdir(args.input_dir):
     output_file = os.path.join(args.output_dir, outfilename)
     input_files.append(input_file)
     output_files.append(output_file)
-    parse_data(input_file, output_file)
+    # parse_data(input_file, output_file)  # this line is for single core processing
 
+# parse data using multiple cores
 with Pool(processes=num_processes) as pool:
     pool.starmap(parse_data, zip(input_files, output_files))
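pool.starmap is used rather than pool.map because it unpacks each tuple from the iterable into positional arguments, so every (input, output) pair feeds parse_data's two parameters. A self-contained sketch with a toy worker (note that Pool rejects processes=0, so the script's 0 default is presumably translated to None or os.cpu_count() where num_processes is set, outside this hunk; Pool(processes=None) uses all available cores):

    from multiprocessing import Pool

    def parse_data(inp_file, out_file):    # toy stand-in for the real worker
        return f"{inp_file} -> {out_file}"

    if __name__ == "__main__":
        pairs = [("a.txt", "a.bin"), ("b.txt", "b.bin")]
        with Pool(processes=2) as pool:
            results = pool.starmap(parse_data, pairs)  # each pair unpacked into two arguments
        print(results)  # ['a.txt -> a.bin', 'b.txt -> b.bin']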