147 строки
4.7 KiB
Python
Executable File
147 строки
4.7 KiB
Python
Executable File
import h5py
|
|
import numpy as np
|
|
import os
|
|
import random
|
|
from struct import pack, unpack, calcsize
|
|
import math
|
|
import argparse
|
|
import copy
|
|
|
|
|
|
class DataReader:
|
|
|
|
def __init__(self,
|
|
filename,
|
|
featuredim,
|
|
batchsize,
|
|
datatype='float32',
|
|
normalize=False,
|
|
targettype='float32'):
|
|
self.mytype = targettype
|
|
if filename.find('.bin') >= 0:
|
|
self.fin = open(filename, 'rb')
|
|
R = unpack('i', self.fin.read(4))[0]
|
|
self.featuredim = unpack('i', self.fin.read(4))[0]
|
|
self.isbinary = True
|
|
self.type = datatype
|
|
print('Open Binary DataReader for data(%d,%d)...' %
|
|
(R, self.featuredim))
|
|
else:
|
|
with open(filename) as f:
|
|
R = sum(1 for _ in f)
|
|
self.fin = open(filename, 'r')
|
|
self.featuredim = featuredim
|
|
self.isbinary = False
|
|
self.type = self.mytype
|
|
|
|
if batchsize <= 0: batchsize = R
|
|
self.query = np.zeros([batchsize, self.featuredim], dtype=self.mytype)
|
|
self.normalize = normalize
|
|
|
|
def norm(self, data):
|
|
square = np.sqrt(np.sum(np.square(data), axis=1))
|
|
data[square < 1e-6] = 1e-6 / math.sqrt(float(self.featuredim))
|
|
square[square < 1e-6] = 1e-6
|
|
data = data / square.reshape([-1, 1])
|
|
return data
|
|
|
|
def readbatch(self):
|
|
numQuerys = self.query.shape[0]
|
|
i = 0
|
|
if self.isbinary:
|
|
while i < numQuerys:
|
|
vec = self.fin.read(
|
|
(np.dtype(self.type).itemsize) * self.featuredim)
|
|
if len(vec) == 0: break
|
|
if len(vec) != (np.dtype(
|
|
self.type).itemsize) * self.featuredim:
|
|
print(
|
|
"%d vector cannot be read correctly: require %d bytes but only read %d bytes"
|
|
% (i, (np.dtype(self.type).itemsize) * self.featuredim,
|
|
len(vec)))
|
|
continue
|
|
self.query[i] = np.frombuffer(vec, dtype=self.type).astype(
|
|
self.mytype)
|
|
i += 1
|
|
else:
|
|
while i < numQuerys:
|
|
line = self.fin.readline()
|
|
if len(line) == 0: break
|
|
|
|
index = line.rfind("\t")
|
|
if index < 0: continue
|
|
|
|
items = line[index + 1:].split("|")
|
|
if len(items) < self.featuredim: continue
|
|
|
|
for j in range(self.featuredim):
|
|
self.query[i, j] = float(items[j])
|
|
i += 1
|
|
print('Load batch query size:%r' % (i))
|
|
if self.normalize != 0: return i, self.norm(self.query[0:i])
|
|
return i, self.query[0:i]
|
|
|
|
def readallbatches(self):
|
|
numQuerys = self.query.shape[0]
|
|
data = []
|
|
R = 0
|
|
while True:
|
|
i, q = self.readbatch()
|
|
if i == numQuerys:
|
|
data.append(copy.deepcopy(q))
|
|
R += i
|
|
else:
|
|
if i > 0:
|
|
data.append(copy.deepcopy(q[0:i]))
|
|
R += i
|
|
break
|
|
return R, np.array(data)
|
|
|
|
def close(self):
|
|
self.fin.close()
|
|
|
|
|
|
def dataset_transform(dataset):
|
|
if dataset.attrs.get('type', 'dense') != 'sparse':
|
|
return np.array(dataset['train']), np.array(dataset['test'])
|
|
return sparse_to_lists(dataset['train'],
|
|
dataset['size_train']), sparse_to_lists(
|
|
dataset['test'], dataset['size_test'])
|
|
|
|
|
|
class HDF5Reader:
|
|
|
|
def __init__(self, filename, data_type='float32'):
|
|
self.data = h5py.File(filename, 'r')
|
|
self._data_type = data_type
|
|
self.featuredim = int(self.data.attrs['dimension']
|
|
) if 'dimension' in self.data.attrs else len(
|
|
self.data['train'][0])
|
|
self.train, self.test = dataset_transform(self.data)
|
|
self.distance = self.data.attrs['distance']
|
|
self.label = np.array(self.data['distances'])
|
|
|
|
def norm(self, data):
|
|
square = np.sqrt(np.sum(np.square(data), axis=1))
|
|
data[square < 1e-6] = 1e-6 / math.sqrt(float(self.featuredim))
|
|
square[square < 1e-6] = 1e-6
|
|
data = data / square.reshape([-1, 1])
|
|
return data
|
|
|
|
def readallbatches(self):
|
|
return np.array(self.train,
|
|
dtype=self._data_type), np.array(self.test,
|
|
dtype=self._data_type)
|
|
|
|
def close(self):
|
|
pass
|
|
|
|
|
|
def sparse_to_lists(data, lengths):
|
|
X = []
|
|
index = 0
|
|
for l in lengths:
|
|
X.append(data[index:index + l])
|
|
index += l
|
|
return X
|