# OpenWPM/openwpm/storage/parquet_schema.py

# 247 lines
# 8.9 KiB
# Python

"""
Arrow schema for our ArrowProvider.py
IF YOU CHANGE THIS FILE ALSO CHANGE schema.sql and test_values.py
AND Schema-Documentation.md
"""
import pyarrow as pa
# Registry of pyarrow schemas, keyed by schema name.
PQ_SCHEMAS = {}

# task
# Every column is required, so each spec is just (column name, arrow type).
fields = [
    pa.field(column, dtype, nullable=False)
    for column, dtype in (
        ("task_id", pa.int64()),
        ("manager_params", pa.string()),
        ("openwpm_version", pa.string()),
        ("browser_version", pa.string()),
        ("instance_id", pa.uint32()),
    )
]
PQ_SCHEMAS["task"] = pa.schema(fields)
# crawl
# Every column is required, so each spec is just (column name, arrow type).
fields = [
    pa.field(column, dtype, nullable=False)
    for column, dtype in (
        ("browser_id", pa.uint32()),
        ("task_id", pa.int64()),
        ("browser_params", pa.string()),
        ("instance_id", pa.uint32()),
    )
]
PQ_SCHEMAS["crawl"] = pa.schema(fields)
# site_visits
# Each spec is (column name, arrow type, nullable); column order is preserved.
fields = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("visit_id", pa.int64(), False),
        ("browser_id", pa.uint32(), False),
        ("instance_id", pa.uint32(), False),
        ("site_url", pa.string(), False),
        ("site_rank", pa.uint32(), True),
    )
]
PQ_SCHEMAS["site_visits"] = pa.schema(fields)
# crawl_history
# Each spec is (column name, arrow type, nullable); column order is preserved.
fields = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("browser_id", pa.uint32(), False),
        ("visit_id", pa.int64(), False),
        ("instance_id", pa.uint32(), False),
        ("command", pa.string(), True),
        ("arguments", pa.string(), True),
        ("retry_number", pa.int8(), True),
        ("command_status", pa.string(), True),
        ("error", pa.string(), True),
        ("traceback", pa.string(), True),
        ("duration", pa.int64(), True),
    )
]
PQ_SCHEMAS["crawl_history"] = pa.schema(fields)
# http_requests
# Each spec is (column name, arrow type, nullable); column order is preserved.
fields = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("url", pa.string(), False),
        ("top_level_url", pa.string(), True),
        ("parent_frame_id", pa.int64(), True),
        ("frame_ancestors", pa.string(), True),
        ("method", pa.string(), False),
        ("referrer", pa.string(), False),
        ("headers", pa.string(), False),
        ("request_id", pa.int64(), False),
        ("is_XHR", pa.bool_(), True),
        ("is_third_party_channel", pa.bool_(), True),
        ("is_third_party_to_top_window", pa.bool_(), True),
        ("triggering_origin", pa.string(), True),
        ("loading_origin", pa.string(), True),
        ("loading_href", pa.string(), True),
        ("req_call_stack", pa.string(), True),
        ("resource_type", pa.string(), False),
        ("post_body", pa.string(), True),
        ("post_body_raw", pa.string(), True),
        ("time_stamp", pa.string(), False),
    )
]
PQ_SCHEMAS["http_requests"] = pa.schema(fields)
# http_responses
# Each spec is (column name, arrow type, nullable); column order is preserved.
fields = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("incognito", pa.int32(), True),
        ("browser_id", pa.uint32(), True),
        ("visit_id", pa.int64(), True),
        ("instance_id", pa.uint32(), False),
        ("extension_session_uuid", pa.string(), True),
        ("event_ordinal", pa.int64(), True),
        ("window_id", pa.int64(), True),
        ("tab_id", pa.int64(), True),
        ("frame_id", pa.int64(), True),
        ("url", pa.string(), False),
        ("method", pa.string(), False),
        ("response_status", pa.int64(), True),
        ("response_status_text", pa.string(), False),
        ("is_cached", pa.bool_(), False),
        ("headers", pa.string(), False),
        ("request_id", pa.int64(), False),
        ("location", pa.string(), False),
        ("time_stamp", pa.string(), False),
        ("content_hash", pa.string(), True),
    )
]
PQ_SCHEMAS["http_responses"] = pa.schema(fields)
# http_redirects
# Specs are (column name, arrow type). A column is nullable unless its name
# appears in the required-name tuple used in the comprehension below.
fields = [
    pa.field(
        column,
        dtype,
        nullable=column
        not in ("instance_id", "response_status_text", "time_stamp"),
    )
    for column, dtype in (
        ("incognito", pa.int32()),
        ("browser_id", pa.uint32()),
        ("visit_id", pa.int64()),
        ("instance_id", pa.uint32()),
        ("old_request_url", pa.string()),
        ("old_request_id", pa.string()),
        ("new_request_url", pa.string()),
        ("new_request_id", pa.string()),
        ("extension_session_uuid", pa.string()),
        ("event_ordinal", pa.int64()),
        ("window_id", pa.int64()),
        ("tab_id", pa.int64()),
        ("frame_id", pa.int64()),
        ("response_status", pa.int64()),
        ("response_status_text", pa.string()),
        ("headers", pa.string()),
        ("time_stamp", pa.string()),
    )
]
PQ_SCHEMAS["http_redirects"] = pa.schema(fields)
# javascript
# Specs are (column name, arrow type). A column is nullable unless its name
# appears in the required-name tuple used in the comprehension below.
fields = [
    pa.field(column, dtype, nullable=column not in ("instance_id", "time_stamp"))
    for column, dtype in (
        ("incognito", pa.int32()),
        ("browser_id", pa.uint32()),
        ("visit_id", pa.int64()),
        ("instance_id", pa.uint32()),
        ("extension_session_uuid", pa.string()),
        ("event_ordinal", pa.int64()),
        ("page_scoped_event_ordinal", pa.int64()),
        ("window_id", pa.int64()),
        ("tab_id", pa.int64()),
        ("frame_id", pa.int64()),
        ("script_url", pa.string()),
        ("script_line", pa.string()),
        ("script_col", pa.string()),
        ("func_name", pa.string()),
        ("script_loc_eval", pa.string()),
        ("document_url", pa.string()),
        ("top_level_url", pa.string()),
        ("call_stack", pa.string()),
        ("symbol", pa.string()),
        ("operation", pa.string()),
        ("value", pa.string()),
        ("arguments", pa.string()),
        ("time_stamp", pa.string()),
    )
]
PQ_SCHEMAS["javascript"] = pa.schema(fields)
# javascript_cookies
# Specs are (column name, arrow type). "instance_id" is the only required
# column in this schema; every other column is nullable.
fields = [
    pa.field(column, dtype, nullable=(column != "instance_id"))
    for column, dtype in (
        ("browser_id", pa.uint32()),
        ("visit_id", pa.int64()),
        ("instance_id", pa.uint32()),
        ("extension_session_uuid", pa.string()),
        ("event_ordinal", pa.int64()),
        ("record_type", pa.string()),
        ("change_cause", pa.string()),
        ("expiry", pa.string()),
        ("is_http_only", pa.bool_()),
        ("is_host_only", pa.bool_()),
        ("is_session", pa.bool_()),
        ("host", pa.string()),
        ("is_secure", pa.bool_()),
        ("name", pa.string()),
        ("path", pa.string()),
        ("value", pa.string()),
        ("same_site", pa.string()),
        ("first_party_domain", pa.string()),
        ("store_id", pa.string()),
        ("time_stamp", pa.string()),
    )
]
PQ_SCHEMAS["javascript_cookies"] = pa.schema(fields)
# navigations
# Specs are (column name, arrow type). "instance_id" is the only required
# column in this schema; every other column is nullable.
fields = [
    pa.field(column, dtype, nullable=(column != "instance_id"))
    for column, dtype in (
        ("incognito", pa.int32()),
        ("browser_id", pa.uint32()),
        ("visit_id", pa.int64()),
        ("instance_id", pa.uint32()),
        ("extension_session_uuid", pa.string()),
        ("process_id", pa.int64()),
        ("window_id", pa.int64()),
        ("tab_id", pa.int64()),
        ("tab_opener_tab_id", pa.int64()),
        ("frame_id", pa.int64()),
        ("parent_frame_id", pa.int64()),
        ("window_width", pa.int64()),
        ("window_height", pa.int64()),
        ("window_type", pa.string()),
        ("tab_width", pa.int64()),
        ("tab_height", pa.int64()),
        ("tab_cookie_store_id", pa.string()),
        ("uuid", pa.string()),
        ("url", pa.string()),
        ("transition_qualifiers", pa.string()),
        ("transition_type", pa.string()),
        ("before_navigate_event_ordinal", pa.int64()),
        ("before_navigate_time_stamp", pa.string()),
        ("committed_event_ordinal", pa.int64()),
        ("time_stamp", pa.string()),
    )
]
PQ_SCHEMAS["navigations"] = pa.schema(fields)
# callstacks
# Each spec is (column name, arrow type, nullable); column order is preserved.
fields = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("visit_id", pa.int64(), False),
        ("request_id", pa.int64(), False),
        ("browser_id", pa.uint32(), False),
        ("instance_id", pa.uint32(), False),
        ("call_stack", pa.string(), True),
    )
]
PQ_SCHEMAS["callstacks"] = pa.schema(fields)
# incomplete_visits
# Both columns are required, so each spec is just (column name, arrow type).
fields = [
    pa.field(column, dtype, nullable=False)
    for column, dtype in (
        ("visit_id", pa.int64()),
        ("instance_id", pa.uint32()),
    )
]
PQ_SCHEMAS["incomplete_visits"] = pa.schema(fields)
# dns_responses
# Each spec is (column name, arrow type, nullable); column order is preserved.
fields = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("request_id", pa.int64(), False),
        ("browser_id", pa.uint32(), False),
        ("visit_id", pa.int64(), False),
        ("hostname", pa.string(), True),
        ("addresses", pa.string(), True),
        ("canonical_name", pa.string(), True),
        ("is_TRR", pa.bool_(), True),
        ("time_stamp", pa.string(), False),
        ("instance_id", pa.uint32(), False),
    )
]
PQ_SCHEMAS["dns_responses"] = pa.schema(fields)