diff --git a/DataScience/ActionSetVisualization.py b/DataScience/ActionSetVisualization.py
index fb91adcf..1d411818 100644
--- a/DataScience/ActionSetVisualization.py
+++ b/DataScience/ActionSetVisualization.py
@@ -21,7 +21,7 @@ def update(files, dt_str=13):
             if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                 data = ds_parse.json_cooked(x)
 
-                if data['a'] <= 0:
+                if data is None:
                     continue
 
                 c_imp_all.update([data['ts'][:dt_str]])
diff --git a/DataScience/LogDownloader.py b/DataScience/LogDownloader.py
index db549ac7..6281c210 100644
--- a/DataScience/LogDownloader.py
+++ b/DataScience/LogDownloader.py
@@ -322,7 +322,7 @@ def download_container(app_id, log_dir, container=None, conn_string=None, accoun
             for x in open(fn, 'rb'):
                 if x.startswith(b'{"_label_cost') and x.strip().endswith(b'}'): # reading only cooked lined
                     data = ds_parse.json_cooked(x)
-                    if data['ei'] not in d or float(data['cost']) < d[data['ei']][1]: # taking line with best reward
+                    if data is not None and (data['ei'] not in d or float(data['cost']) < d[data['ei']][1]): # taking line with best reward
                         d[data['ei']] = (data['ts'], float(data['cost']), x)
 
         print(' - len(d): {}'.format(len(d)))
diff --git a/DataScience/RankRewardAnalyzer.py b/DataScience/RankRewardAnalyzer.py
index 436d10b0..cb7e0c5f 100644
--- a/DataScience/RankRewardAnalyzer.py
+++ b/DataScience/RankRewardAnalyzer.py
@@ -122,6 +122,8 @@ def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=1
 
         if x.startswith(b'{"_label_cost":'):
             data = ds_parse.json_cooked(x)
+            if data is None:
+                continue
             ei = str(data['ei'], 'utf-8')
             c = str(data['cost'], 'utf-8')
             azure_data.append((ei, c))
diff --git a/DataScience/dashboard_utils.py b/DataScience/dashboard_utils.py
index 9674fd8d..2e58393e 100644
--- a/DataScience/dashboard_utils.py
+++ b/DataScience/dashboard_utils.py
@@ -144,8 +144,8 @@ def create_stats(log_fp, d=None, predictions_files=None):
         if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
             data = ds_parse.json_cooked(x)
 
-            # Skip not activated lines or wrongly formated lines
-            if data['skipLearn'] or data['p'] < 1e-10 or data['num_a'] < 1 or data['a'] < 1:
+            # Skip wrongly formatted lines or not activated lines
+            if data is None or data['skipLearn']:
                 continue
 
             if data['cost'] == b'0':
diff --git a/DataScience/ds_parse.py b/DataScience/ds_parse.py
index 73a490c7..a23493cf 100644
--- a/DataScience/ds_parse.py
+++ b/DataScience/ds_parse.py
@@ -58,11 +58,14 @@ def process_dsjson_file(fp, d=None, e=None):
             continue
 
         if not (x.startswith(b'{"') and x.strip().endswith(b'}')):
-            print('Corrupted line: {}'.format(x))
+            print('Skipping corrupted line {}: Missing brackets {{ }}'.format(i))
             continue
 
         if x.startswith(b'{"_label_cost":'):
             data = json_cooked(x)
+            if data is None:
+                print('Skipping corrupted line {}: data is None'.format(i))
+                continue
 
             if data['skipLearn']:   # Ignore not activated lines
                 not_activated += 1
@@ -77,8 +80,6 @@ def process_dsjson_file(fp, d=None, e=None):
                 stats[data['a']] = [0,0,0,0,0,0]
             stats[data['a']][5] += 1
 
-            if data['p'] <= 0:
-                continue
             stats[data['a']][4] += 1/data['p']
             baselineRandom[1] += 1/data['p']/data['num_a']
 
@@ -145,7 +146,9 @@ def json_cooked(x, do_devType=False, do_VWState=False, do_p_vec=False):
     ind5 = ind4+13                  # len('","EventId":"') = 13
     ind6 = x.find(b'"',ind5)
     ind7 = x.find(b',"a"',ind5)
-    ind8 = x.find(b']',ind7+7)      # equal to: x.find('],"c',ind7+8)
+    ind8 = x.find(b'],"c"',ind7+7)
+    if ind8 == -1:
+        return None
 
     data = {}
     data['o'] = 1 if b',"o":' in x[ind2+30:ind2+50] else 0
@@ -157,7 +160,10 @@ def json_cooked(x, do_devType=False, do_VWState=False, do_p_vec=False):
     data['a'] = int(data['a_vec'][0])
     data['num_a'] = len(data['a_vec'])
     data['skipLearn'] = b'"_skipLearn":true' in x[ind2+34:ind3]     # len('"_label_Action":1,"_labelIndex":0,') = 34
-
+
+    if data['p'] < 1e-10 or data['a'] < 1 or data['num_a'] < 1:
+        return None
+
     if do_VWState:
         ind11 = x[-120:].find(b'VWState')
         data['model_id'] = x[-120+ind11+15:-4] if ind11 > -1 else b'N/A'
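
For reference, a minimal usage sketch of the calling convention this patch establishes (not part of the diff; the function name count_valid_events and the file path argument are illustrative): json_cooked() now returns None for malformed cooked lines, so every caller must guard with a None check before indexing the result, as the updated call sites above do.

import ds_parse

def count_valid_events(fp):
    """Count cooked lines that json_cooked() can parse and that were activated."""
    valid = 0
    for x in open(fp, 'rb'):
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)
            # json_cooked() returns None for malformed lines (missing '],"c"' anchor,
            # p < 1e-10, a < 1, or empty action vector) -- skip those before indexing.
            if data is None or data['skipLearn']:
                continue
            valid += 1
    return valid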