Merge pull request #72 from microsoft/abhiram-embedding-fix

Fix to limit the memory usage when using fasttext embedding loaders
This commit is contained in:
Said Bleik 2019-05-28 13:02:39 -04:00 коммит произвёл GitHub
Родитель 2dc37f87eb 36d7411bec
Коммит 96b3015096
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 11 добавлений и 7 удалений

Просмотреть файл

@@ -35,8 +35,8 @@ def test_load_pretrained_vectors_glove(tmp_path):
@pytest.mark.smoke
def test_load_pretrained_vectors_fasttext(tmp_path):
-filename = "wiki.en.bin"
+filename = "wiki.simple.bin"
load_fasttext(tmp_path)
filepath = os.path.join(os.path.join(tmp_path, "fastText"), filename)
statinfo = os.stat(filepath)
-assert statinfo.st_size == 8493673445
+assert statinfo.st_size == 2668450750

Просмотреть файл

@@ -57,7 +57,7 @@ def test_load_pretrained_vectors_glove():
def test_load_pretrained_vectors_fasttext():
dir_path = "temp_data/"
-file_path = os.path.join(os.path.join(dir_path, "fastText"), "wiki.en.bin")
+file_path = os.path.join(os.path.join(dir_path, "fastText"), "wiki.simple.bin")
assert isinstance(load_fasttext(dir_path), FastText)

Просмотреть файл

@@ -2,5 +2,5 @@
# Licensed under the MIT License.
WORD2VEC_URL = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
-FASTTEXT_EN_URL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip'
+FASTTEXT_EN_URL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip'
GLOVE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

Просмотреть файл

@@ -32,10 +32,14 @@ def _extract_fasttext_vectors(zip_path, dest_path="."):
return dest_path
-def _download_fasttext_vectors(download_dir, file_name="wiki.en.zip"):
+def _download_fasttext_vectors(download_dir, file_name="wiki.simple.zip"):
""" Downloads pre-trained word vectors for English, trained on Wikipedia using
fastText. You can directly download the vectors from here:
-https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
+https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip
+For the full version of pre-trained word vectors, change the url for
+FASTTEXT_EN_URL to https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
+in __init__.py
Args:
download_dir (str): File path to download the file
@@ -75,7 +79,7 @@ def _maybe_download_and_extract(dest_path, file_name):
return file_path
-def load_pretrained_vectors(dest_path, file_name="wiki.en.bin"):
+def load_pretrained_vectors(dest_path, file_name="wiki.simple.bin"):
""" Method that loads fastText vectors. Downloads if it doesn't exist.
Args: