зеркало из https://github.com/openwpm/OpenWPM.git
247 строки
8.9 KiB
Python
247 строки
8.9 KiB
Python
"""
Arrow schema for our ArrowProvider.py

IF YOU CHANGE THIS FILE, ALSO CHANGE schema.sql, test_values.py
AND Schema-Documentation.md
"""
|
|
|
|
import pyarrow as pa
|
|
|
|
# Registry of Arrow schemas, keyed by table name; filled in section by
# section below.
PQ_SCHEMAS = {}
|
|
|
|
# task
# Columns are listed as (name, arrow type, nullable); every column of this
# table is required.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("task_id", pa.int64(), False),
        ("manager_params", pa.string(), False),
        ("openwpm_version", pa.string(), False),
        ("browser_version", pa.string(), False),
        ("instance_id", pa.uint32(), False),
    )
]
PQ_SCHEMAS["task"] = pa.schema(fields)
|
|
|
|
# crawl
# Columns are listed as (name, arrow type, nullable); every column of this
# table is required.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("browser_id", pa.uint32(), False),
        ("task_id", pa.int64(), False),
        ("browser_params", pa.string(), False),
        ("instance_id", pa.uint32(), False),
    )
]
PQ_SCHEMAS["crawl"] = pa.schema(fields)
|
|
|
|
# site_visits
# Columns are listed as (name, arrow type, nullable); only site_rank may be
# null.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("visit_id", pa.int64(), False),
        ("browser_id", pa.uint32(), False),
        ("instance_id", pa.uint32(), False),
        ("site_url", pa.string(), False),
        ("site_rank", pa.uint32(), True),
    )
]
PQ_SCHEMAS["site_visits"] = pa.schema(fields)
|
|
|
|
# crawl_history
# Columns are listed as (name, arrow type, nullable); the three id columns
# are required, everything describing the command itself may be null.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("browser_id", pa.uint32(), False),
        ("visit_id", pa.int64(), False),
        ("instance_id", pa.uint32(), False),
        ("command", pa.string(), True),
        ("arguments", pa.string(), True),
        ("retry_number", pa.int8(), True),
        ("command_status", pa.string(), True),
        ("error", pa.string(), True),
        ("traceback", pa.string(), True),
        ("duration", pa.int64(), True),
    )
]
PQ_SCHEMAS["crawl_history"] = pa.schema(fields)
|
|
|
|
# http_requests
# Columns are listed as (name, arrow type, nullable). Column order is part of
# the schema, so the rows below keep the original ordering.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("url", pa.string(), False),
        ("top_level_url", pa.string(), True),
        ("parent_frame_id", pa.int64(), True),
        ("frame_ancestors", pa.string(), True),
        ("method", pa.string(), False),
        ("referrer", pa.string(), False),
        ("headers", pa.string(), False),
        ("request_id", pa.int64(), False),
        ("is_XHR", pa.bool_(), True),
        ("is_third_party_channel", pa.bool_(), True),
        ("is_third_party_to_top_window", pa.bool_(), True),
        ("triggering_origin", pa.string(), True),
        ("loading_origin", pa.string(), True),
        ("loading_href", pa.string(), True),
        ("req_call_stack", pa.string(), True),
        ("resource_type", pa.string(), False),
        ("post_body", pa.string(), True),
        ("post_body_raw", pa.string(), True),
        ("time_stamp", pa.string(), False),
    )
]
PQ_SCHEMAS["http_requests"] = pa.schema(fields)
|
|
|
|
# http_responses
# Columns are listed as (name, arrow type, nullable). Column order is part of
# the schema, so the rows below keep the original ordering.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("url", pa.string(), False),
        ("method", pa.string(), False),
        ("response_status", pa.int64(), True),
        ("response_status_text", pa.string(), False),
        ("is_cached", pa.bool_(), False),
        ("headers", pa.string(), False),
        ("request_id", pa.int64(), False),
        ("location", pa.string(), False),
        ("time_stamp", pa.string(), False),
        ("content_hash", pa.string(), True),
    )
]
PQ_SCHEMAS["http_responses"] = pa.schema(fields)
|
|
|
|
# http_redirects
# Columns are listed as (name, arrow type, nullable).
# NOTE(review): old_request_id / new_request_id are strings here while
# request_id is int64 in the other tables of this file - confirm against
# schema.sql before changing either.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("old_request_url", pa.string(), True),
        ("old_request_id", pa.string(), True),
        ("new_request_url", pa.string(), True),
        ("new_request_id", pa.string(), True),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("response_status", pa.int64(), True),
        ("response_status_text", pa.string(), False),
        ("headers", pa.string(), True),
        ("time_stamp", pa.string(), False),
    )
]
PQ_SCHEMAS["http_redirects"] = pa.schema(fields)
|
|
|
|
# javascript
# Columns are listed as (name, arrow type, nullable); only instance_id and
# time_stamp are required.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("page_scoped_event_ordinal", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("script_url", pa.string(), True),
        ("script_line", pa.string(), True),
        ("script_col", pa.string(), True),
        ("func_name", pa.string(), True),
        ("script_loc_eval", pa.string(), True),
        ("document_url", pa.string(), True),
        ("top_level_url", pa.string(), True),
        ("call_stack", pa.string(), True),
        ("symbol", pa.string(), True),
        ("operation", pa.string(), True),
        ("value", pa.string(), True),
        ("arguments", pa.string(), True),
        ("time_stamp", pa.string(), False),
    )
]
PQ_SCHEMAS["javascript"] = pa.schema(fields)
|
|
|
|
# javascript_cookies
# Columns are listed as (name, arrow type, nullable); instance_id is the only
# required column (time_stamp included may be null in this table).
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("record_type", pa.string(), True),
        ("change_cause", pa.string(), True),
        ("expiry", pa.string(), True),
        ("is_http_only", pa.bool_(), True),
        ("is_host_only", pa.bool_(), True),
        ("is_session", pa.bool_(), True),
        ("host", pa.string(), True),
        ("is_secure", pa.bool_(), True),
        ("name", pa.string(), True),
        ("path", pa.string(), True),
        ("value", pa.string(), True),
        ("same_site", pa.string(), True),
        ("first_party_domain", pa.string(), True),
        ("store_id", pa.string(), True),
        ("time_stamp", pa.string(), True),
    )
]
PQ_SCHEMAS["javascript_cookies"] = pa.schema(fields)
|
|
|
|
# navigations
# Columns are listed as (name, arrow type, nullable); instance_id is the only
# required column.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("process_id", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("tab_opener_tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("parent_frame_id", pa.int64(), True),
        ("window_width", pa.int64(), True),
        ("window_height", pa.int64(), True),
        ("window_type", pa.string(), True),
        ("tab_width", pa.int64(), True),
        ("tab_height", pa.int64(), True),
        ("tab_cookie_store_id", pa.string(), True),
        ("uuid", pa.string(), True),
        ("url", pa.string(), True),
        ("transition_qualifiers", pa.string(), True),
        ("transition_type", pa.string(), True),
        ("before_navigate_event_ordinal", pa.int64(), True),
        ("before_navigate_time_stamp", pa.string(), True),
        ("committed_event_ordinal", pa.int64(), True),
        ("time_stamp", pa.string(), True),
    )
]
PQ_SCHEMAS["navigations"] = pa.schema(fields)
|
|
|
|
# callstacks
# Columns are listed as (name, arrow type, nullable); the id columns are
# required, the call_stack payload may be null.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("visit_id", pa.int64(), False),
        ("request_id", pa.int64(), False),
        ("browser_id", pa.uint32(), False),
        ("instance_id", pa.uint32(), False),
        ("call_stack", pa.string(), True),
    )
]
PQ_SCHEMAS["callstacks"] = pa.schema(fields)
|
|
|
|
# incomplete_visits
# Columns are listed as (name, arrow type, nullable); both columns are
# required.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("visit_id", pa.int64(), False),
        ("instance_id", pa.uint32(), False),
    )
]
PQ_SCHEMAS["incomplete_visits"] = pa.schema(fields)
|
|
|
|
# dns_responses
# Columns are listed as (name, arrow type, nullable); the id columns and
# time_stamp are required, the resolver details may be null.
fields = [
    pa.field(column, arrow_type, nullable=optional)
    for column, arrow_type, optional in (
        ("request_id", pa.int64(), False),
        ("browser_id", pa.uint32(), False),
        ("visit_id", pa.int64(), False),
        ("hostname", pa.string(), True),
        ("addresses", pa.string(), True),
        ("canonical_name", pa.string(), True),
        ("is_TRR", pa.bool_(), True),
        ("time_stamp", pa.string(), False),
        ("instance_id", pa.uint32(), False),
    )
]
PQ_SCHEMAS["dns_responses"] = pa.schema(fields)
|