зеркало из https://github.com/openwpm/OpenWPM.git
Updated docs and codebase in general to use save_content instead of save_all_content and save_javascript
This commit is contained in:
Родитель
0ff197501b
Коммит
2e99d250a6
14
README.md
14
README.md
|
@ -96,13 +96,16 @@ available [below](#output-format).
|
|||
* Response body content
|
||||
* Saves all files encountered during the crawl to a `LevelDB`
|
||||
database de-duplicated by the md5 hash of the content.
|
||||
* Set `browser_params['save_all_content'] = True`
|
||||
* Set `browser_params['save_content'] = True`
|
||||
* The `content_hash` column of the `http_responses` table contains the md5
|
||||
hash for each script, and can be used to do content lookups in the
|
||||
LevelDB content database.
|
||||
* NOTE: this instrumentation may lead to performance issues when a large
|
||||
number of browsers are in use.
|
||||
* Set `browser_params['save_javascript'] = True` to save only Javascript
|
||||
* Set `browser_params['save_content']` to a comma-separated list of
|
||||
[resource_types](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType)
|
||||
to save only specific types of files, for instance
|
||||
`browser_params['save_content'] = "script"` to save only JavaScript
|
||||
files. This will lessen the performance impact of this instrumentation
|
||||
when a large number of browsers are used in parallel.
|
||||
* Flash Cookies
|
||||
|
@ -190,10 +193,9 @@ inline by sending a `create_table` message to the data aggregator.
|
|||
#### Parquet on Amazon S3 **Experimental**
|
||||
As an option, OpenWPM can save data directly to an Amazon S3 bucket as a
|
||||
Parquet Dataset. This is currently experimental and hasn't been thoroughly
|
||||
tested. Response body content (both `save_javascript` and `save_all_content`),
|
||||
screenshots, and page source saving is not currently supported and will still
|
||||
be stored in local databases and directories. To enable S3 saving specify the
|
||||
following configuration parameters in `manager_params`:
|
||||
tested. Screenshot and page source saving is not currently supported and
|
||||
will still be stored in local databases and directories. To enable S3
|
||||
saving specify the following configuration parameters in `manager_params`:
|
||||
* Output format: `manager_params['output_format'] = 's3'`
|
||||
* S3 bucket name: `manager_params['s3_bucket'] = 'openwpm-test-crawl'`
|
||||
* Directory within S3 bucket: `manager_params['s3_directory'] = '2018-09-09_test-crawl-new'`
|
||||
|
|
|
@ -196,7 +196,7 @@ class LocalAggregator(BaseAggregator):
|
|||
# (if content saving is enabled on any browser)
|
||||
self.ldb_enabled = False
|
||||
for params in browser_params:
|
||||
if params['save_javascript'] or params['save_all_content']:
|
||||
if params['save_content']:
|
||||
self.ldb_enabled = True
|
||||
break
|
||||
|
||||
|
|
|
@ -4,8 +4,7 @@
|
|||
"js_instrument": false,
|
||||
"http_instrument": false,
|
||||
"navigation_instrument": false,
|
||||
"save_javascript": false,
|
||||
"save_all_content": false,
|
||||
"save_content": false,
|
||||
|
||||
"random_attributes": false,
|
||||
"bot_mitigation": false,
|
||||
|
|
|
@ -425,7 +425,7 @@ class TestHTTPInstrument(OpenWPMTest):
|
|||
def get_config(self, data_dir=""):
|
||||
manager_params, browser_params = self.get_test_config(data_dir)
|
||||
browser_params[0]['http_instrument'] = True
|
||||
browser_params[0]['save_javascript'] = True
|
||||
browser_params[0]['save_content'] = "script"
|
||||
return manager_params, browser_params
|
||||
|
||||
def test_page_visit(self):
|
||||
|
@ -602,7 +602,7 @@ class TestHTTPInstrument(OpenWPMTest):
|
|||
test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
|
||||
manager_params, browser_params = self.get_test_config(str(tmpdir))
|
||||
browser_params[0]['http_instrument'] = True
|
||||
browser_params[0]['save_all_content'] = True
|
||||
browser_params[0]['save_content'] = True
|
||||
manager = TaskManager.TaskManager(manager_params, browser_params)
|
||||
manager.get(url=test_url, sleep=1)
|
||||
manager.close()
|
||||
|
|
|
@ -66,7 +66,7 @@ class TestProfile(OpenWPMTest):
|
|||
def test_profile_saved_when_launch_crashes(self):
|
||||
manager_params, browser_params = self.get_config()
|
||||
browser_params[0]['proxy'] = True
|
||||
browser_params[0]['save_javascript'] = True
|
||||
browser_params[0]['save_content'] = "script"
|
||||
manager = TaskManager.TaskManager(manager_params, browser_params)
|
||||
manager.get('http://example.com')
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче