Merge branch 'master' of github.com:citp/OpenWPM

2016-10-19 16:05:55 -04:00 · 2016-10-19 16:05:55 -04:00 · 3a14416c57
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-OpenWPM [![Build Status](https://travis-ci.org/citp/OpenWPM.svg)](https://travis-ci.org/citp/OpenWPM)
+OpenWPM [![Build Status](https://travis-ci.org/citp/OpenWPM.svg?branch=master)](https://travis-ci.org/citp/OpenWPM)
 =======

 OpenWPM is a web privacy measurement framework which makes it easy to collect
@ -24,7 +24,7 @@ us know!
 Quick Start
 -----------

-Once installed, it's very easy to run a quick test of OpenWPM. Check out
+Once installed, it is very easy to run a quick test of OpenWPM. Check out
 `demo.py` for an example. This will the default setting specified in
 `automation/default_manager_params.json` and
 `automation/default_browser_params.json`, with the exception of the changes
--- a/automation/CommandSequence.py
+++ b/automation/CommandSequence.py
@ -38,7 +38,8 @@ class CommandSequence:
        self.contains_get_or_browse = True

    def dump_flash_cookies(self, timeout=60):
-        """ dumps the local storage vectors (flash, localStorage, cookies) to db """
+        """ dumps the local storage vectors (flash, localStorage, cookies) to db
+        Side effect: closes the current tab."""
        self.total_timeout += timeout
        if not self.contains_get_or_browse:
            raise CommandExecutionError("No get or browse request preceding "
@ -47,7 +48,8 @@ class CommandSequence:
        self.commands_with_timeout.append((command, timeout))

    def dump_profile_cookies(self, timeout=60):
-        """ dumps from the profile path to a given file (absolute path) """
+        """ dumps from the profile path to a given file (absolute path)
+        Side effect: closes the current tab."""
        self.total_timeout += timeout
        if not self.contains_get_or_browse:
            raise CommandExecutionError("No get or browse request preceding "
@ -69,3 +71,23 @@ class CommandSequence:
                                        "the dump storage vectors command", self)
        command = ('EXTRACT_LINKS',)
        self.commands_with_timeout.append((command, timeout))
+
+    def save_screenshot(self, screenshot_name, timeout=30):
+        """Queues a command saving '<screenshot_name>.png' to 'screenshots' in the data directory."""
+        self.total_timeout += timeout
+        if not self.contains_get_or_browse:
+            raise CommandExecutionError("No get or browse request preceding "
+                                        "the save screenshot command", self)
+        command = ('SAVE_SCREENSHOT', screenshot_name,)
+        self.commands_with_timeout.append((command, timeout))
+
+    def dump_page_source(self, dump_name, timeout=30):
+        """Queues a command dumping the page source to '<dump_name>.html' in the 'sources' directory."""
+        self.total_timeout += timeout
+        if not self.contains_get_or_browse:
+            raise CommandExecutionError("No get or browse request preceding "
+                                        "the dump page source command", self)
+        command = ('DUMP_PAGE_SOURCE', dump_name,)
+        self.commands_with_timeout.append((command, timeout))
+
+
--- a/automation/Commands/browser_commands.py
+++ b/automation/Commands/browser_commands.py
@ -4,6 +4,7 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import MoveTargetOutOfBoundsException
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.action_chains import ActionChains
+import os
 import random
 import time

@ -225,3 +226,15 @ def dump_profile_cookies(start_time, visit_id, webdriver, browser_params, manage

    # Close connection to db
    sock.close()
+
+def save_screenshot(screenshot_name, webdriver, browser_params, manager_params):
+    """Save a screenshot of the current page as '<screenshot_name>.png'
+    inside manager_params['screenshot_path']. browser_params is unused."""
+    webdriver.save_screenshot(os.path.join(manager_params['screenshot_path'], screenshot_name + '.png'))
+
+def dump_page_source(dump_name, webdriver, browser_params, manager_params):
+    """Write webdriver.page_source, UTF-8 encoded, to '<dump_name>.html'
+    inside manager_params['source_dump_path']. browser_params is unused."""
+    with open(os.path.join(manager_params['source_dump_path'], dump_name + '.html'), 'wb') as f:
+        # The file is opened in binary mode, so append a bytes newline
+        f.write(webdriver.page_source.encode('utf8') + b'\n')
--- a/automation/Commands/command_executor.py
+++ b/automation/Commands/command_executor.py
@ -40,3 +40,11 @@ def execute_command(command, webdriver, proxy_queue, browser_settings, browser_p

    if command[0] == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params, manager_params)
+
+    if command[0] == 'SAVE_SCREENSHOT':
+        browser_commands.save_screenshot(screenshot_name=command[1], webdriver=webdriver,
+                                         browser_params=browser_params, manager_params=manager_params)
+
+    if command[0] == 'DUMP_PAGE_SOURCE':
+        browser_commands.dump_page_source(dump_name=command[1], webdriver=webdriver,
+                                          browser_params=browser_params, manager_params=manager_params)
--- a/automation/TaskManager.py
+++ b/automation/TaskManager.py
@ -60,8 +60,16 @@ class TaskManager:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
+        manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'], 'screenshots')
+        manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

+        # Create data directories if they do not exist
+        if not os.path.exists(manager_params['screenshot_path']):
+            os.makedirs(manager_params['screenshot_path'])
+        if not os.path.exists(manager_params['source_dump_path']):
+            os.makedirs(manager_params['source_dump_path'])
+
        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
--- a/install.sh
+++ b/install.sh
@ -17,7 +17,11 @@ fi

 sudo apt-get update

-sudo apt-get install -y firefox htop git python-dev libxml2-dev libxslt-dev libffi-dev libssl-dev build-essential xvfb libboost-python-dev libleveldb1 libleveldb-dev libjpeg-dev
+sudo apt-get install -y firefox htop git python-dev libxml2-dev libxslt-dev libffi-dev libssl-dev build-essential xvfb libboost-python-dev libleveldb-dev libjpeg-dev
+
+# On some Ubuntu versions the package libleveldb1v5 isn't available; fall back to libleveldb1.
+sudo apt-get install -y libleveldb1v5 || sudo apt-get install -y libleveldb1
+
 if [ "$flash" = true ]; then
    sudo apt-get install -y adobe-flashplugin
 fi
@ -34,5 +38,6 @@ fi
 # Install specific version of Firefox known to work well with the selenium version above
 wget https://ftp.mozilla.org/pub/firefox/releases/45.0.1/linux-x86_64/en-US/firefox-45.0.1.tar.bz2
 tar jxf firefox*.tar.bz2
+rm -rf firefox-bin
 mv firefox firefox-bin
 rm firefox*.tar.bz2
--- a/test/expected.py
+++ b/test/expected.py
@ -125,22 +125,22 @@ set_property = [(SET_PROP_TEST_PAGE,

 # AudioContext and AudioNode symbols we expect from our test script
 audio = {
-    "AudioContext.createOscillator",
-    "AudioContext.createAnalyser",
-    "AudioContext.createGain",
-    "AudioContext.createScriptProcessor",
-    "GainNode.gain",
-    "OscillatorNode.type",
-    "OscillatorNode.connect",
-    "AnalyserNode.connect",
-    "ScriptProcessorNode.connect",
-    "AudioContext.destination",
-    "GainNode.connect",
-    "ScriptProcessorNode.onaudioprocess",
-    "OscillatorNode.start",
-    "AnalyserNode.frequencyBinCount",
-    "AnalyserNode.getFloatFrequencyData",
-    "AnalyserNode.disconnect",
-    "ScriptProcessorNode.disconnect",
-    "GainNode.disconnect",
-    "OscillatorNode.stop"}
+    u"AudioContext.createOscillator",
+    u"AudioContext.createAnalyser",
+    u"AudioContext.createGain",
+    u"AudioContext.createScriptProcessor",
+    u"GainNode.gain",
+    u"OscillatorNode.type",
+    u"OscillatorNode.connect",
+    u"AnalyserNode.connect",
+    u"ScriptProcessorNode.connect",
+    u"AudioContext.destination",
+    u"GainNode.connect",
+    u"ScriptProcessorNode.onaudioprocess",
+    u"OscillatorNode.start",
+    u"AnalyserNode.frequencyBinCount",
+    u"AnalyserNode.getFloatFrequencyData",
+    u"AnalyserNode.disconnect",
+    u"ScriptProcessorNode.disconnect",
+    u"GainNode.disconnect",
+    u"OscillatorNode.stop"}
--- a/test/test_extension.py
+++ b/test/test_extension.py
@ -96,6 +96,7 @@ class TestExtension(OpenWPMTest):
                observed_rows.add(item)
        assert set(expected.webrtc_calls) == observed_rows

+    @pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason='Flaky on Travis CI')
    def test_audio_fingerprinting(self, tmpdir):
        db = self.visit('/audio_fingerprinting.html', str(tmpdir))
        # Check that all calls and methods are recorded
--- a/test/test_pages/expected_source.html
+++ b/test/test_pages/expected_source.html
@ -0,0 +1,18 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml"><head>
+<title>Simple Page A</title>
+  <script type="application/javascript">
+    function set_cookie() {
+        document.cookie = 'test_cookie=Test-Page-A; expires=Tue, 31 Dec 2030 00:00:00 UTC; path=/';
+        console.log(window.navigator.userAgent);
+     }
+  </script>
+ </head>
+ <body onload="set_cookie()">
+ <a href="http://localtest.me:8000/test_pages/simple_c.html">Click me!</a>
+ <a href="simple_d.html">Click me also!</a>
+ <a href="javascript:alert(1)">Click me for a JS alert!</a>
+ <a href="https://www.google.com">Go to google.com</a>
+ <a href="http://example.com/test.html?localtest.me">Go to example.com</a>
+ 
+</body></html>
--- a/test/test_simple_commands.py
+++ b/test/test_simple_commands.py
@ -1,4 +1,7 @@
 import pytest # noqa
+
+from PIL import Image
+import filecmp
 import os
 import utilities
 from ..automation import CommandSequence
@ -9,6 +12,8 @@ url_b = utilities.BASE_TEST_URL + '/simple_b.html'
 url_c = utilities.BASE_TEST_URL + '/simple_c.html'
 url_d = utilities.BASE_TEST_URL + '/simple_d.html'

+rendered_js_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
+
 class TestSimpleCommands():
    """Test correctness of simple commands and check
    that resulting data is properly keyed.
@ -191,3 +196,41 @@ class TestSimpleCommands():
        assert qry_res[0][0] == 4


+    def test_save_screenshot_valid(self, tmpdir):
+        """Check that 'save_screenshot' works and screenshot is created properly."""
+        # Run the test crawl
+        manager_params, browser_params = self.get_config(str(tmpdir))
+        manager = TaskManager.TaskManager(manager_params, browser_params)
+        cs = CommandSequence.CommandSequence(url_a)
+        cs.get(sleep=1)
+        cs.save_screenshot('test_screenshot')
+        manager.execute_command_sequence(cs)
+        manager.close(post_process=False)
+
+
+        # A blank capture is one uniform colour (e.g. all white or all black),
+        # i.e. every band's (min, max) extrema collapse to a single value.
+        im = Image.open(os.path.join(str(tmpdir), 'screenshots', 'test_screenshot.png'))
+        extrema = [band.getextrema() for band in im.split()]
+        is_blank = all(low == high for low, high in extrema)
+
+        assert not is_blank
+
+
+    def test_dump_page_source_valid(self, tmpdir):
+        """Check that 'dump_page_source' works and source is saved properly."""
+        # Run the test crawl
+        manager_params, browser_params = self.get_config(str(tmpdir))
+        manager = TaskManager.TaskManager(manager_params, browser_params)
+        cs = CommandSequence.CommandSequence(url_a)
+        cs.get(sleep=1)
+        cs.dump_page_source('test_source')
+        manager.execute_command_sequence(cs)
+        manager.close(post_process=False)
+
+        with open(os.path.join(str(tmpdir), 'sources', 'test_source.html'), 'rb') as f:
+            actual_source = f.read()
+        # Resolve the fixture relative to this file so the test passes from any CWD
+        with open(os.path.join(os.path.dirname(__file__), 'test_pages', 'expected_source.html'), 'rb') as f:
+            expected_source = f.read()
+        assert actual_source == expected_source