ds_parse.py: consolidated all corrupted-line checks into json_cooked()

This commit is contained in:
Marco Rossi 2019-10-24 10:52:03 -07:00
Родитель 8a43cc9954
Коммит ec3161c69d
5 изменённых файлов: 17 добавлений и 9 удалений

Просмотреть файл

@ -21,7 +21,7 @@ def update(files, dt_str=13):
if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
data = ds_parse.json_cooked(x)
if data['a'] <= 0:
if data is None:
continue
c_imp_all.update([data['ts'][:dt_str]])

Просмотреть файл

@ -322,7 +322,7 @@ def download_container(app_id, log_dir, container=None, conn_string=None, accoun
for x in open(fn, 'rb'):
if x.startswith(b'{"_label_cost') and x.strip().endswith(b'}'): # reading only cooked lined
data = ds_parse.json_cooked(x)
if data['ei'] not in d or float(data['cost']) < d[data['ei']][1]: # taking line with best reward
if data is not None and (data['ei'] not in d or float(data['cost']) < d[data['ei']][1]): # taking line with best reward
d[data['ei']] = (data['ts'], float(data['cost']), x)
print(' - len(d): {}'.format(len(d)))

Просмотреть файл

@ -122,6 +122,8 @@ def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=1
if x.startswith(b'{"_label_cost":'):
data = ds_parse.json_cooked(x)
if data is None:
continue
ei = str(data['ei'], 'utf-8')
c = str(data['cost'], 'utf-8')
azure_data.append((ei, c))

Просмотреть файл

@ -144,8 +144,8 @@ def create_stats(log_fp, d=None, predictions_files=None):
if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
data = ds_parse.json_cooked(x)
# Skip not activated lines or wrongly formated lines
if data['skipLearn'] or data['p'] < 1e-10 or data['num_a'] < 1 or data['a'] < 1:
# Skip wrongly formatted lines or not activated lines
if data is None or data['skipLearn']:
continue
if data['cost'] == b'0':

Просмотреть файл

@ -58,11 +58,14 @@ def process_dsjson_file(fp, d=None, e=None):
continue
if not (x.startswith(b'{"') and x.strip().endswith(b'}')):
print('Corrupted line: {}'.format(x))
print('Skipping corrupted line {}: Missing brackets \{ \}'.format(i))
continue
if x.startswith(b'{"_label_cost":'):
data = json_cooked(x)
if data is None:
print('Skipping corrupted line {}: data is None'.format(i))
continue
if data['skipLearn']: # Ignore not activated lines
not_activated += 1
@ -77,8 +80,6 @@ def process_dsjson_file(fp, d=None, e=None):
stats[data['a']] = [0,0,0,0,0,0]
stats[data['a']][5] += 1
if data['p'] <= 0:
continue
stats[data['a']][4] += 1/data['p']
baselineRandom[1] += 1/data['p']/data['num_a']
@ -145,7 +146,9 @@ def json_cooked(x, do_devType=False, do_VWState=False, do_p_vec=False):
ind5 = ind4+13 # len('","EventId":"') = 13
ind6 = x.find(b'"',ind5)
ind7 = x.find(b',"a"',ind5)
ind8 = x.find(b']',ind7+7) # equal to: x.find('],"c',ind7+8)
ind8 = x.find(b'],"c"',ind7+7)
if ind8 == -1:
return None
data = {}
data['o'] = 1 if b',"o":' in x[ind2+30:ind2+50] else 0
@ -157,7 +160,10 @@ def json_cooked(x, do_devType=False, do_VWState=False, do_p_vec=False):
data['a'] = int(data['a_vec'][0])
data['num_a'] = len(data['a_vec'])
data['skipLearn'] = b'"_skipLearn":true' in x[ind2+34:ind3] # len('"_label_Action":1,"_labelIndex":0,') = 34
if data['p'] < 1e-10 or data['a'] < 1 or data['num_a'] < 1:
return None
if do_VWState:
ind11 = x[-120:].find(b'VWState')
data['model_id'] = x[-120+ind11+15:-4] if ind11 > -1 else b'N/A'