Removing reference to download_data
Parent: eac4bdce59
Commit: 06f68fdf1e
@@ -1,119 +1,119 @@
-## ==============================================================================
-## Copyright (c) Microsoft. All rights reserved.
-## Licensed under the MIT license. See LICENSE.md file in the project root
-## for full license information.
-## ==============================================================================
+# ==============================================================================
+# Copyright (c) Microsoft. All rights reserved.
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================

-## This program downloads training, validation and test data and creates additional files
-## with token-ids and frequencies.
+# This program downloads training, validation and test data and creates additional files
+# with token-ids and frequencies.

-#import urllib.request, os, sys, tarfile, operator
+import urllib.request, os, sys, tarfile, operator

-## accumulate word counts in dictionary
-#def add_to_count(word, word2Count):
-#    if word in word2Count:
-#        word2Count[word] += 1
-#    else:
-#        word2Count[word] = 1
+# accumulate word counts in dictionary
+def add_to_count(word, word2Count):
+    if word in word2Count:
+        word2Count[word] += 1
+    else:
+        word2Count[word] = 1

-## for a text file returns a dictionary with the frequency of each word
-#def count_words_in_file(path):
-#    with open(path,'r') as f:
-#        word2count = {}
-#        for line in f:
-#            words = line.split()
-#            for word in words:
-#                add_to_count(word, word2count)
-#    return word2count
+# for a text file returns a dictionary with the frequency of each word
+def count_words_in_file(path):
+    with open(path,'r') as f:
+        word2count = {}
+        for line in f:
+            words = line.split()
+            for word in words:
+                add_to_count(word, word2count)
+    return word2count

-## from a dictionary mapping words to counts creates two files:
-## * a vocabulary file containing all words sorted by decreasing frequency, one word per line
-## * a frequency file containing the frequencies of these word, one number per line.
-#def write_vocab_and_frequencies(word2count, vocab_file_path, freq_file_path, word2count_file_path, word2id_file_path):
-#    vocab_file = open(vocab_file_path,'w', newline='\r\n')
-#    freq_file = open(freq_file_path,'w', newline='\r\n')
-#    word2count_file = open(word2count_file_path,'w', newline='\r\n')
-#    word2id_file = open(word2id_file_path,'w', newline='\r\n')
-#    sorted_entries = sorted(word2count.items(), key = operator.itemgetter(1) , reverse = True)
+# from a dictionary mapping words to counts creates two files:
+# * a vocabulary file containing all words sorted by decreasing frequency, one word per line
+# * a frequency file containing the frequencies of these word, one number per line.
+def write_vocab_and_frequencies(word2count, vocab_file_path, freq_file_path, word2count_file_path, word2id_file_path):
+    vocab_file = open(vocab_file_path,'w', newline='\r\n')
+    freq_file = open(freq_file_path,'w', newline='\r\n')
+    word2count_file = open(word2count_file_path,'w', newline='\r\n')
+    word2id_file = open(word2id_file_path,'w', newline='\r\n')
+    sorted_entries = sorted(word2count.items(), key = operator.itemgetter(1) , reverse = True)

-#    id=int(0)
-#    for word, freq in sorted_entries:
-#        vocab_file.write(word+"\n")
-#        freq_file.write("%i\n" % freq)
-#        word2count_file.writelines("%s\t%i\n" % (word, freq))
-#        word2id_file.writelines("%s\t%i\n" % (word, id))
-#        id +=1
+    id=int(0)
+    for word, freq in sorted_entries:
+        vocab_file.write(word+"\n")
+        freq_file.write("%i\n" % freq)
+        word2count_file.writelines("%s\t%i\n" % (word, freq))
+        word2id_file.writelines("%s\t%i\n" % (word, id))
+        id +=1

-#    #close the files
-#    vocab_file.close()
-#    freq_file.close()
-#    word2count_file.close()
+    #close the files
+    vocab_file.close()
+    freq_file.close()
+    word2count_file.close()

-##copy txt file and append '<eos>' at end of each line
-#def append_eos_and_trim(from_path, to_path, max_lines_in_output = None):
-#    with open(from_path,'r') as f:
-#        lines = f.read().splitlines()
+#copy txt file and append '<eos>' at end of each line
+def append_eos_and_trim(from_path, to_path, max_lines_in_output = None):
+    with open(from_path,'r') as f:
+        lines = f.read().splitlines()

-#    with open(to_path,'w') as f:
-#        count=0
-#        for line in lines:
-#            count += 1
-#            if max_lines_in_output != None and count > max_lines_in_output:
-#                break
+    with open(to_path,'w') as f:
+        count=0
+        for line in lines:
+            count += 1
+            if max_lines_in_output != None and count > max_lines_in_output:
+                break

-#            f.write(line + "<eos>\n")
+            f.write(line + "<eos>\n")


-#class Paths(object):
+class Paths(object):

-#    # Relative paths of the data file in the downloaded tar file
-#    tar_path_test = './simple-examples/data/ptb.test.txt'
-#    tar_path_train = './simple-examples/data/ptb.train.txt'
-#    tar_path_validation = './simple-examples/data/ptb.valid.txt'
+    # Relative paths of the data file in the downloaded tar file
+    tar_path_test = './simple-examples/data/ptb.test.txt'
+    tar_path_train = './simple-examples/data/ptb.train.txt'
+    tar_path_validation = './simple-examples/data/ptb.valid.txt'

-#    tmp_dir = './tmp/'
+    tmp_dir = './tmp/'

-#    # final path of the data files
-#    data_dir = './ptb/'
-#    test = os.path.join(data_dir, 'test.txt')
-#    train = os.path.join(data_dir, 'train.txt')
-#    validation = os.path.join(data_dir, 'valid.txt')
+    # final path of the data files
+    data_dir = './ptb/'
+    test = os.path.join(data_dir, 'test.txt')
+    train = os.path.join(data_dir, 'train.txt')
+    validation = os.path.join(data_dir, 'valid.txt')

-#    # files derived from the data files
-#    tokens = os.path.join(data_dir, 'vocab.txt')
-#    frequencies = os.path.join(data_dir, 'freq.txt')
-#    token2frequency = os.path.join(data_dir, 'token2freq.txt')
-#    token2id = os.path.join(data_dir, 'token2id.txt')
+    # files derived from the data files
+    tokens = os.path.join(data_dir, 'vocab.txt')
+    frequencies = os.path.join(data_dir, 'freq.txt')
+    token2frequency = os.path.join(data_dir, 'token2freq.txt')
+    token2id = os.path.join(data_dir, 'token2id.txt')


-#if __name__=='__main__':
+if __name__=='__main__':

-#    # downloading the data
-#    url ="http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz"
+    # downloading the data
+    url ="http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz"

-#    tmpGz = "tmp.tgz"
-#    if not os.path.isfile(tmpGz):
-#        print("downloading " + url + " to " + tmpGz)
-#        urllib.request.urlretrieve(url, tmpGz)
+    tmpGz = "tmp.tgz"
+    if not os.path.isfile(tmpGz):
+        print("downloading " + url + " to " + tmpGz)
+        urllib.request.urlretrieve(url, tmpGz)

-#    # extracting the files we need from the tarfile
-#    fileReader=tarfile.open(tmpGz, 'r')
-#    print("Extracting files into: " + Paths.tmp_dir)
-#    fileReader.extract(Paths.tar_path_test, path = Paths.tmp_dir)
-#    fileReader.extract(Paths.tar_path_train, path = Paths.tmp_dir)
-#    fileReader.extract(Paths.tar_path_validation, path = Paths.tmp_dir)
+    # extracting the files we need from the tarfile
+    fileReader=tarfile.open(tmpGz, 'r')
+    print("Extracting files into: " + Paths.tmp_dir)
+    fileReader.extract(Paths.tar_path_test, path = Paths.tmp_dir)
+    fileReader.extract(Paths.tar_path_train, path = Paths.tmp_dir)
+    fileReader.extract(Paths.tar_path_validation, path = Paths.tmp_dir)

-#    print('creating final data files in directory:' + Paths.data_dir)
-#    os.mkdir(Paths.data_dir)
-#    append_eos_and_trim(os.path.join(Paths.tmp_dir, Paths.tar_path_test), Paths.test)
-#    append_eos_and_trim(os.path.join(Paths.tmp_dir, Paths.tar_path_train), Paths.train)
-#    append_eos_and_trim(os.path.join(Paths.tmp_dir, Paths.tar_path_validation), Paths.validation, max_lines_in_output = 50)
+    print('creating final data files in directory:' + Paths.data_dir)
+    os.mkdir(Paths.data_dir)
+    append_eos_and_trim(os.path.join(Paths.tmp_dir, Paths.tar_path_test), Paths.test)
+    append_eos_and_trim(os.path.join(Paths.tmp_dir, Paths.tar_path_train), Paths.train)
+    append_eos_and_trim(os.path.join(Paths.tmp_dir, Paths.tar_path_validation), Paths.validation, max_lines_in_output = 50)

-#    fileReader.close()
+    fileReader.close()

-#    #removing the temporary file
-#    os.remove(tmpGz)
+    #removing the temporary file
+    os.remove(tmpGz)

-#    # from the training file generate a number of helper files
-#    word2count = count_words_in_file(Paths.train)
-#    write_vocab_and_frequencies(word2count, Paths.tokens, Paths.frequencies, Paths.token2frequency, Paths.token2id)
+    # from the training file generate a number of helper files
+    word2count = count_words_in_file(Paths.train)
+    write_vocab_and_frequencies(word2count, Paths.tokens, Paths.frequencies, Paths.token2frequency, Paths.token2id)
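A side note on the data-preparation script in the hunk above: write_vocab_and_frequencies opens four output files but closes only three of them, so word2id_file is never explicitly closed. Below is a minimal sketch, not part of this commit, of an equivalent routine built on context managers; the name write_vocab_and_frequencies_sketch is made up for illustration, and the output format (vocabulary and frequency files in decreasing-frequency order, tab-separated token/count and token/id files) is taken from the code above.

# Sketch only, not part of the commit: writes the same four outputs as
# write_vocab_and_frequencies above, but uses context managers so every file,
# including the word-to-id file, gets closed. The function name is illustrative.
import operator

def write_vocab_and_frequencies_sketch(word2count, vocab_path, freq_path, word2count_path, word2id_path):
    # most frequent tokens first; ids are assigned in that order
    sorted_entries = sorted(word2count.items(), key=operator.itemgetter(1), reverse=True)
    with open(vocab_path, 'w', newline='\r\n') as vocab_file, \
         open(freq_path, 'w', newline='\r\n') as freq_file, \
         open(word2count_path, 'w', newline='\r\n') as word2count_file, \
         open(word2id_path, 'w', newline='\r\n') as word2id_file:
        for token_id, (word, freq) in enumerate(sorted_entries):
            vocab_file.write(word + "\n")                      # one token per line
            freq_file.write("%i\n" % freq)                     # matching count, one per line
            word2count_file.write("%s\t%i\n" % (word, freq))   # token<TAB>count
            word2id_file.write("%s\t%i\n" % (word, token_id))  # token<TAB>id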
@@ -18,8 +18,6 @@ from cntk.models import For, Sequential
 from cntk.utils import log_number_of_parameters, ProgressPrinter
 from data_reader import DataReader
 from math import log, exp
-from download_data import Paths
-
 from cntk.device import set_default_device, cpu, gpu

 # Setting global parameters
@@ -179,12 +177,6 @@ def print_progress(samples_per_second, average_full_ce, total_samples, total_tim

 # Creates and trains an rnn language model.
 def train_lm():
-    training_text_file = Paths.train
-    validation_text_file = Paths.validation
-    token_to_ix_file_path = Paths.token2id
-    sampling_weights_file_path = Paths.frequencies
-
-
     data = DataReader(token_to_id_path, segment_sepparator)

     # Create model nodes for the source and target inputs
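With the Paths-based assignments removed, the surviving DataReader(token_to_id_path, segment_sepparator) call has to pick up its arguments from definitions outside this hunk. Purely as a hypothetical sketch, not part of the commit, module-level settings standing in for the removed lines could look like the following; the file names simply mirror the old download_data.Paths values, and the '<eos>' separator is an assumption based on the token the data-preparation script appends to every line.

# Hypothetical sketch, not part of the commit: module-level settings that could
# replace the removed Paths-based assignments in train_lm. The file names mirror
# the old download_data.Paths values; point them at wherever the prepared PTB
# files actually live.
import os

data_dir = './ptb/'
training_text_file = os.path.join(data_dir, 'train.txt')
validation_text_file = os.path.join(data_dir, 'valid.txt')
token_to_id_path = os.path.join(data_dir, 'token2id.txt')
sampling_weights_file_path = os.path.join(data_dir, 'freq.txt')
segment_sepparator = '<eos>'  # assumed sentence separator; spelling kept to match the existing DataReader call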