Merge pull request #57 from dlazesz/fix_create_pretraining
Fix parse_data() invoked twice and some cosmetics
This commit is contained in:
Commit 1129d54321
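Before this change the script parsed every input file twice: once serially inside the loop that collects the file lists, and again in parallel through Pool.starmap over the same lists. The fix keeps only the parallel path and turns the serial call into a comment. A minimal sketch of the removed double invocation, assuming toy file names (the real lists are built from --input_dir):

    from multiprocessing import Pool

    def parse_data(inp_file, out_file):
        print(f"parsing {inp_file} -> {out_file}")

    if __name__ == "__main__":
        input_files = ["a.txt", "b.txt"]    # hypothetical stand-ins
        output_files = ["a.bin", "b.bin"]

        for inp, out in zip(input_files, output_files):
            parse_data(inp, out)            # first invocation: serial (dropped by this PR)

        with Pool(processes=2) as pool:     # second invocation: parallel (kept)
            pool.starmap(parse_data, zip(input_files, output_files))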
@@ -2,27 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import unicodedata
 import os
 import logging
 from multiprocessing import Pool
-import multiprocessing
-import os
-import logging
-import shutil
-import tempfile
 import argparse
-import json
-from urllib.parse import urlparse
-from pathlib import Path
-from typing import Optional, Tuple, Union, IO, Callable, Set
-from hashlib import sha256
-from functools import wraps
-import random
-from tqdm import tqdm
-from random import shuffle
-import pickle
 
+
 import sys
 sys.path.append("..")
+
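The dropped imports are presumably unused in this script; os and logging were also imported twice. Duplicate imports are harmless at runtime, since Python returns the module cached in sys.modules instead of loading it again, so removing them is purely cosmetic:

    import sys
    import os
    import os  # no-op: the interpreter returns the cached module object

    print(sys.modules["os"] is os)  # True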
@@ -36,29 +21,31 @@ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message
 logger = logging.getLogger(__name__)
 
 
-def parse_data(input_file, output_file):
-    if not os.path.exists(output_file):
-        print(input_file)
-        dataset = GenericPretrainingDataCreator(
-            input_file, tokenizer, dupe_factor=9, max_seq_length=512)
-        dataset.save(output_file)
-        print(f"Completed Pickling: {output_file}")
+def parse_data(inp_file, out_file):
+    if not os.path.exists(out_file):
+        print(inp_file)
+        dataset = GenericPretrainingDataCreator(inp_file, tokenizer, dupe_factor=9, max_seq_length=512)
+        dataset.save(out_file)
+        print(f"Completed Pickling: {out_file}")
     else:
-        print(f'Already parsed: {output_file}')
+        print(f'Already parsed: {out_file}')
 
 
 parser = argparse.ArgumentParser(
     description="Give initial arguments for parsing")
 
-parser.add_argument("--input_dir", type=str, help="This folder contains .txt files of Wikipedia Data. Each .txt file contains the text from the documents. \
-    It makes an assumption that each line in the file represents a single line in the document too.\
-    A blank line represents completion of a document.")
+parser.add_argument("--input_dir", type=str,
+                    help="This folder contains .txt files of Wikipedia Data."
+                         " Each .txt file contains the text from the documents."
+                         " It makes an assumption that each line in the file represents"
+                         " a single line in the document too. A blank line represents completion of a document.")
 parser.add_argument("--output_dir", type=str, help="Path to Output Directory.")
 parser.add_argument("--token_file", default="bert-large-uncased", type=str)
 parser.add_argument("--do_lower_case", default=False, action="store_true",
                     help="This flag indicates the wheter the text should be lowercase or not")
 parser.add_argument("--processes", "-p", default=0, type=int,
-                    help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents that it will use all the available cores in the CPU.")
+                    help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents"
+                         " that it will use all the available cores in the CPU.")
 
 args = parser.parse_args()
 tokenizer = BertTokenizer.from_pretrained(
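The rewritten help texts drop the old backslash continuations in favor of adjacent string literals, which Python concatenates at compile time into a single string. For example:

    msg = ("This folder contains .txt files of Wikipedia Data."
           " Each .txt file contains the text from the documents.")
    print(msg)  # one string; no '+' or '\' needed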
@@ -82,7 +69,8 @@ for filename in os.listdir(args.input_dir):
     output_file = os.path.join(args.output_dir, outfilename)
     input_files.append(input_file)
     output_files.append(output_file)
-    parse_data(input_file, output_file)
+    # parse_data(input_file, output_file)  # this line is for single core processing
 
+# parse data using multiple cores
 with Pool(processes=num_processes) as pool:
     pool.starmap(parse_data, zip(input_files, output_files))
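pool.starmap is used rather than pool.map because it unpacks each tuple from the iterable into positional arguments, so every (input, output) pair feeds parse_data's two parameters. A self-contained sketch with a toy worker (note that Pool rejects processes=0, so the script's 0 default is presumably translated to None or os.cpu_count() where num_processes is set, outside this hunk; Pool(processes=None) uses all available cores):

    from multiprocessing import Pool

    def parse_data(inp_file, out_file):    # toy stand-in for the real worker
        return f"{inp_file} -> {out_file}"

    if __name__ == "__main__":
        pairs = [("a.txt", "a.bin"), ("b.txt", "b.bin")]
        with Pool(processes=2) as pool:
            results = pool.starmap(parse_data, pairs)  # each pair unpacked into two arguments
        print(results)  # ['a.txt -> a.bin', 'b.txt -> b.bin']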