1
0
Форкнуть 0
UCOSP-winter-2018_TrackingT.../dataExplorationScript.md

16 KiB

import boto3
import botocore
import json
import pandas as pd
import utils.load_data_util

# Widen pandas' display limits so a whole dataframe renders in a single view:
# up to 500 columns, no wrapping of wide frames, and up to 50000 rows.
pd.set_option(
    'display.max_columns', 500,
    'display.expand_frame_repr', False,
    'display.max_rows', 50000,
)

# Handle to the S3 service resource (not used by the statements visible here).
s3 = boto3.resource('s3')
# Helper function to trim the json files into a proper json format
def process_string(data):
    """Return ``data`` with its first and last characters swapped for brackets.

    The first and the last character of ``data`` are dropped and the remainder
    is wrapped in ``[`` and ``]``. Strings shorter than two characters yield
    ``"[]"``.
    """
    return "".join(("[", data[1:-1], "]"))

# Helper function to count the occurrences of each value of a given key
def count_key(data, key, key_value_count):
    """Tally how often each value stored under ``key`` appears in ``data``.

    Every element of ``data`` is indexed with ``key``; the counter kept for
    the resulting value in ``key_value_count`` is incremented, starting from
    zero when the value has not been seen before. ``key_value_count`` is
    updated in place and nothing is returned.
    """
    for entry in data:
        observed = entry[key]
        seen_so_far = key_value_count.get(observed, 0)
        key_value_count[observed] = seen_so_far + 1

# Load a sample through the project's loader (called with 50) and collect the
# distinct values found in its `arguments` column.
result = utils.load_data_util.load_random_data(50)
unique_args = result.arguments.unique()

# Write every distinct value to uniqueArgs.txt, one per line, UTF-8 encoded.
# `count` ends up holding the number of lines written (0 if there were none).
count = 0
with open("uniqueArgs.txt", "wb") as f:
    for count, arg in enumerate(unique_args, start=1):
        f.write(str(arg).encode("utf-8") + b"\n")

# Per-symbol count of non-null entries in each remaining column; the bare
# expression on the last line displays the table when run in a notebook.
grouped_by_symbol = result.groupby(by=['symbol']).count()
grouped_by_symbol

arguments call_stack crawl_id file_number func_name in_iframe location operation script_col script_line script_loc_eval script_url time_stamp value
symbol
CanvasRenderingContext2D.fillRect 1 1 1 1 1 1 1 1 1 1 1 1 1 1
CanvasRenderingContext2D.fillStyle 0 2 2 2 2 2 2 2 2 2 2 2 2 2
CanvasRenderingContext2D.textBaseline 0 1 1 1 1 1 1 1 1 1 1 1 1 1
HTMLCanvasElement.getContext 3 3 3 3 3 3 3 3 3 3 3 3 3 3
HTMLCanvasElement.height 0 1 1 1 1 1 1 1 1 1 1 1 1 1
HTMLCanvasElement.style 0 1 1 1 1 1 1 1 1 1 1 1 1 1
HTMLCanvasElement.width 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.iceGatheringState 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.idpLoginUrl 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.localDescription 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.onicecandidate 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.onremovestream 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.peerIdentity 0 1 1 1 1 1 1 1 1 1 1 1 1 1
RTCPeerConnection.remoteDescription 0 2 2 2 2 2 2 2 2 2 2 2 2 2
RTCPeerConnection.signalingState 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.Storage.getItem 182 182 182 182 182 182 182 182 182 182 182 182 182 182
window.Storage.key 3 3 3 3 3 3 3 3 3 3 3 3 3 3
window.Storage.length 0 5 5 5 5 5 5 5 5 5 5 5 5 5
window.Storage.removeItem 35 35 35 35 35 35 35 35 35 35 35 35 35 35
window.Storage.setItem 49 49 49 49 49 49 49 49 49 49 49 49 49 49
window.document.cookie 0 479 479 479 479 479 479 479 479 479 479 479 479 479
window.localStorage 0 94 94 94 94 94 94 94 94 94 94 94 94 94
window.name 0 31 31 31 31 31 31 31 31 31 31 31 31 31
window.navigator.appCodeName 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.navigator.appName 0 20 20 20 20 20 20 20 20 20 20 20 20 20
window.navigator.appVersion 0 1 1 1 1 1 1 1 1 1 1 1 1 1
window.navigator.cookieEnabled 0 14 14 14 14 14 14 14 14 14 14 14 14 14
window.navigator.language 0 21 21 21 21 21 21 21 21 21 21 21 21 21
window.navigator.mimeTypes[application/futuresplash].type 0 4 4 4 4 4 4 4 4 4 4 4 4 4
window.navigator.mimeTypes[application/x-shockwave-flash].type 0 3 3 3 3 3 3 3 3 3 3 3 3 3
window.navigator.onLine 0 1 1 1 1 1 1 1 1 1 1 1 1 1
window.navigator.platform 0 23 23 23 23 23 23 23 23 23 23 23 23 23
window.navigator.plugins[Shockwave Flash].description 0 39 39 39 39 39 39 39 39 39 39 39 39 39
window.navigator.plugins[Shockwave Flash].filename 0 7 7 7 7 7 7 7 7 7 7 7 7 7
window.navigator.plugins[Shockwave Flash].length 0 9 9 9 9 9 9 9 9 9 9 9 9 9
window.navigator.plugins[Shockwave Flash].name 0 10 10 10 10 10 10 10 10 10 10 10 10 10
window.navigator.plugins[Shockwave Flash].version 0 7 7 7 7 7 7 7 7 7 7 7 7 7
window.navigator.product 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.navigator.productSub 0 2 2 2 2 2 2 2 2 2 2 2 2 2
window.navigator.userAgent 0 258 258 258 258 258 258 258 258 258 258 258 258 258
window.navigator.vendor 0 7 7 7 7 7 7 7 7 7 7 7 7 7
window.navigator.vendorSub 0 1 1 1 1 1 1 1 1 1 1 1 1 1
window.screen.colorDepth 0 22 22 22 22 22 22 22 22 22 22 22 22 22
window.screen.pixelDepth 0 5 5 5 5 5 5 5 5 5 5 5 5 5
window.sessionStorage 0 65 65 65 65 65 65 65 65 65 65 65 65 65
# Pairwise correlation of the numeric/boolean columns only. The frame also
# holds string columns (e.g. `arguments`, `symbol`); recent pandas versions no
# longer drop those silently inside corr() and raise instead, so the numeric
# subset is selected explicitly before correlating.
result.select_dtypes(include=['number', 'bool']).corr()

crawl_id file_number in_iframe
crawl_id NaN NaN NaN
file_number NaN 1.000000 0.137485
in_iframe NaN 0.137485 1.000000