зеркало из https://github.com/microsoft/mwt-ds.git
ds_parse.py: consolidated all corrupted-line checks into json_cooked()
This commit is contained in:
Родитель
8a43cc9954
Коммит
ec3161c69d
|
@ -21,7 +21,7 @@ def update(files, dt_str=13):
|
|||
|
||||
if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
|
||||
data = ds_parse.json_cooked(x)
|
||||
if data['a'] <= 0:
|
||||
if data is None:
|
||||
continue
|
||||
|
||||
c_imp_all.update([data['ts'][:dt_str]])
|
||||
|
|
|
@ -322,7 +322,7 @@ def download_container(app_id, log_dir, container=None, conn_string=None, accoun
|
|||
for x in open(fn, 'rb'):
|
||||
if x.startswith(b'{"_label_cost') and x.strip().endswith(b'}'): # reading only cooked lines
|
||||
data = ds_parse.json_cooked(x)
|
||||
if data['ei'] not in d or float(data['cost']) < d[data['ei']][1]: # taking line with best reward
|
||||
if data is not None and (data['ei'] not in d or float(data['cost']) < d[data['ei']][1]): # taking line with best reward
|
||||
d[data['ei']] = (data['ts'], float(data['cost']), x)
|
||||
print(' - len(d): {}'.format(len(d)))
|
||||
|
||||
|
|
|
@ -122,6 +122,8 @@ def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=1
|
|||
|
||||
if x.startswith(b'{"_label_cost":'):
|
||||
data = ds_parse.json_cooked(x)
|
||||
if data is None:
|
||||
continue
|
||||
ei = str(data['ei'], 'utf-8')
|
||||
c = str(data['cost'], 'utf-8')
|
||||
azure_data.append((ei, c))
|
||||
|
|
|
@ -144,8 +144,8 @@ def create_stats(log_fp, d=None, predictions_files=None):
|
|||
if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
|
||||
data = ds_parse.json_cooked(x)
|
||||
|
||||
# Skip non-activated lines or wrongly formatted lines
|
||||
if data['skipLearn'] or data['p'] < 1e-10 or data['num_a'] < 1 or data['a'] < 1:
|
||||
# Skip wrongly formatted lines or non-activated lines
|
||||
if data is None or data['skipLearn']:
|
||||
continue
|
||||
|
||||
if data['cost'] == b'0':
|
||||
|
|
|
@ -58,11 +58,14 @@ def process_dsjson_file(fp, d=None, e=None):
|
|||
continue
|
||||
|
||||
if not (x.startswith(b'{"') and x.strip().endswith(b'}')):
|
||||
print('Corrupted line: {}'.format(x))
|
||||
print('Skipping corrupted line {}: Missing brackets \{ \}'.format(i))
|
||||
continue
|
||||
|
||||
if x.startswith(b'{"_label_cost":'):
|
||||
data = json_cooked(x)
|
||||
if data is None:
|
||||
print('Skipping corrupted line {}: data is None'.format(i))
|
||||
continue
|
||||
|
||||
if data['skipLearn']: # Ignore not activated lines
|
||||
not_activated += 1
|
||||
|
@ -77,8 +80,6 @@ def process_dsjson_file(fp, d=None, e=None):
|
|||
stats[data['a']] = [0,0,0,0,0,0]
|
||||
|
||||
stats[data['a']][5] += 1
|
||||
if data['p'] <= 0:
|
||||
continue
|
||||
|
||||
stats[data['a']][4] += 1/data['p']
|
||||
baselineRandom[1] += 1/data['p']/data['num_a']
|
||||
|
@ -145,7 +146,9 @@ def json_cooked(x, do_devType=False, do_VWState=False, do_p_vec=False):
|
|||
ind5 = ind4+13 # len('","EventId":"') = 13
|
||||
ind6 = x.find(b'"',ind5)
|
||||
ind7 = x.find(b',"a"',ind5)
|
||||
ind8 = x.find(b']',ind7+7) # equal to: x.find('],"c',ind7+8)
|
||||
ind8 = x.find(b'],"c"',ind7+7)
|
||||
if ind8 == -1:
|
||||
return None
|
||||
|
||||
data = {}
|
||||
data['o'] = 1 if b',"o":' in x[ind2+30:ind2+50] else 0
|
||||
|
@ -157,7 +160,10 @@ def json_cooked(x, do_devType=False, do_VWState=False, do_p_vec=False):
|
|||
data['a'] = int(data['a_vec'][0])
|
||||
data['num_a'] = len(data['a_vec'])
|
||||
data['skipLearn'] = b'"_skipLearn":true' in x[ind2+34:ind3] # len('"_label_Action":1,"_labelIndex":0,') = 34
|
||||
|
||||
|
||||
if data['p'] < 1e-10 or data['a'] < 1 or data['num_a'] < 1:
|
||||
return None
|
||||
|
||||
if do_VWState:
|
||||
ind11 = x[-120:].find(b'VWState')
|
||||
data['model_id'] = x[-120+ind11+15:-4] if ind11 > -1 else b'N/A'
|
||||
|
|
Загрузка…
Ссылка в новой задаче