import sqlite3

from selenium.webdriver import Firefox

from openwpm import command_sequence
from openwpm.commands.types import BaseCommand
from openwpm.config import BrowserParams, ManagerParamsInternal
from openwpm.socket_interface import ClientSocket
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.storage.storage_providers import TableName
from openwpm.task_manager import TaskManager
from openwpm.utilities import db_utils

from . import utilities

url_a = utilities.BASE_TEST_URL + "/simple_a.html"

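# Expected (top_url, link) pairs: simple_a.html links to simple_c.html,
# simple_d.html, and one external example.com URL.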
PAGE_LINKS = {
    (
        f"{utilities.BASE_TEST_URL}/simple_a.html",
        f"{utilities.BASE_TEST_URL}/simple_c.html",
    ),
    (
        f"{utilities.BASE_TEST_URL}/simple_a.html",
        f"{utilities.BASE_TEST_URL}/simple_d.html",
    ),
    (
        f"{utilities.BASE_TEST_URL}/simple_a.html",
        "http://example.com/test.html?localhost",
    ),
}


class CollectLinksCommand(BaseCommand):
    """Collect links with `scheme` and save in table `table_name`"""

    def __init__(self, table_name: TableName, scheme: str) -> None:
        self.scheme = scheme
        self.table_name = table_name

    def execute(
        self,
        webdriver: Firefox,
        browser_params: BrowserParams,
        manager_params: ManagerParamsInternal,
        extension_socket: ClientSocket,
    ) -> None:
        browser_id = self.browser_id
        visit_id = self.visit_id
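        # Collect the href of every anchor element whose URL uses the requested scheme.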
        link_urls = [
            x
            for x in (
                element.get_attribute("href")
                for element in webdriver.find_elements_by_tag_name("a")
            )
            if x.startswith(self.scheme + "://")
        ]
        current_url = webdriver.current_url

        # Open a socket to the storage controller so records can be written directly.
        sock = ClientSocket()
        assert manager_params.storage_controller_address is not None
        sock.connect(*manager_params.storage_controller_address)

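        # Each record is a (table name, row dict) pair for the storage controller to persist.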
        for link in link_urls:
            query = (
                self.table_name,
                {
                    "top_url": current_url,
                    "link": link,
                    "visit_id": visit_id,
                    "browser_id": browser_id,
                },
            )
            sock.send(query)
        sock.close()


def test_custom_function(default_params, xpi, server):
    """Test `custom_function` with an inline func that collects links"""
    table_name = TableName("page_links")

    manager_params, browser_params = default_params
    path = manager_params.data_directory / "crawl-data.sqlite"
    db = sqlite3.connect(path)
    cur = db.cursor()

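    # Create the custom table up front; SQLite cannot bind identifiers as query
    # parameters, so the table name is interpolated into the CREATE statement.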
    cur.execute(
        """CREATE TABLE IF NOT EXISTS %s (
            top_url TEXT, link TEXT,
            visit_id INTEGER, browser_id INTEGER);"""
        % table_name
    )
    cur.close()
    db.close()

    storage_provider = SQLiteStorageProvider(path)
    manager = TaskManager(manager_params, browser_params, storage_provider, None)
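    # Visit the test page, then run the custom command to record its links.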
    cs = command_sequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.append_command(CollectLinksCommand(table_name, "http"))
    manager.execute_command_sequence(cs)
    manager.close()

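    # The crawl database should now contain exactly the expected (top_url, link) pairs.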
    query_result = db_utils.query_db(
        path,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)