Simplify usage to container and jupyter notebook implementation (#2)
* Add run shell script for linux vm
* Add dpne.zip to ease running experiment out of the box
* Remove run.sh, replace with jupyter notebook
* add run.cmd code to notebook
* uncomment lines
* uncomment lines
* Parse results into dataframe
* Update README.md: add details on how to run on pyspark container
* Update README.md: update shrike version to the one that works with container
* Update README.md
* Update README.md: added docker cp command
* Update README.md: nltk.download('punkt_tab')
* Remove persist flags since they cause errors running local cluster
* Update comments
* Add cell to replicate reddit experiment
* Add logging to reddit conversion script
* Update README.md: add instructions to replicate results from paper in notebook
* Add note about persist-flags in DPNE step
Parent: 55cca9c1b6
Commit: ddcac1043f
.gitignore

@@ -28,7 +28,8 @@ share/python-wheels/
MANIFEST

# ignore output folder
output
output*
*.json

# PyInstaller
# Usually these files are written by a python script from a template
DPNE Experiments.ipynb

@@ -0,0 +1,138 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fadf4e17-726f-461f-9448-1e45a712830a",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Run this section if you want to replicate results from the paper for the reddit case\n",
    "## WARNING: These are large files and so this step can take some time.\n",
    "## If you want to run experiments on your own dataset, skip this cell and move to the next one\n",
    "\n",
    "# First, download the reddit file\n",
    "! curl -L -O https://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip\n",
    "\n",
    "# Unzip it\n",
    "! unzip corpus-webis-tldr-17.zip\n",
    "\n",
    "# Convert it to the required format\n",
    "! python scripts/convert_reddit.py --input_path corpus-webis-tldr-17.json --output_path reddit.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "91d81959-70c6-4327-98e6-12b9c71fb1ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# location of source file to extract DPNE from\n",
    "# MAKE SURE TO UPDATE THIS TO POINT TO YOUR DESIRED DATASET IF YOU DIDN'T RUN THE PREVIOUS CELL\n",
    "SOURCE_DATASET=\"./reddit.json\"\n",
    "\n",
    "# output folder high level\n",
    "OUTPUT_FOLDER=\"output\"\n",
    "\n",
    "# file extension - json or anything else.\n",
    "FILE_EXTENSION=\"json\"\n",
    "\n",
    "# epsilon for DP\n",
    "DP_EPSILON=\"4.0\"\n",
    "\n",
    "# the highest N in n-grams to DP-extract\n",
    "NGRAM_SIZE_LIMIT=\"10\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "92c2f441-1e72-431a-be2d-ee77d4b912fb",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# run shell scripts to have an easy interface to experiment and get started with DPNE\n",
    "# This script runs a series of tokenization, ngram extraction, and DP N-gram extraction using the parameters specified as arguments\n",
    "## NOTE: the --persist-flags argument for the extract DPNE step (third line below) was set to 00 for local running, but you may want to change it back to the default 11 value to persist intermediate results!\n",
    "\n",
    "! spark-submit dpne/tokenize_text.py -f json --ngrams {NGRAM_SIZE_LIMIT} --max_num_tokens 400 --allow_multiple_ngrams 1 -i {SOURCE_DATASET} -o ./{OUTPUT_FOLDER}/tokenize_text -t {FILE_EXTENSION}\n",
    "\n",
    "! spark-submit dpne/split_ngrams.py --ngram_size {NGRAM_SIZE_LIMIT} -i ./$OUTPUT_FOLDER/tokenize_text -o ./{OUTPUT_FOLDER}/split_ngrams -f {FILE_EXTENSION} -t {FILE_EXTENSION}\n",
    "\n",
    "! spark-submit dpne/extract_dpne.py --dp_epsilon {DP_EPSILON} --dp_eta 0.1 --dp_delta 0.5 --contribution_limit 10 --persist_flags 00 --log_flags 00 --top_k 1 --delta_user_count 0 --ngram_size {NGRAM_SIZE_LIMIT} --filter_one_side 0 --budget_distribute 10.0 --estimate_sample_size 0.8 -i ./{OUTPUT_FOLDER}/split_ngrams -o ./{OUTPUT_FOLDER}/dpne_sample -f {FILE_EXTENSION} -t {FILE_EXTENSION}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f868084b-5780-493f-8137-359221f1f05f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stats on 1gram\n",
      "Stats on 2gram\n",
      "Stats on 3gram\n",
      "Stats on 4gram\n"
     ]
    }
   ],
   "source": [
    "# analyze and plot the resultant data\n",
    "import os, sys, pandas as pd\n",
    "\n",
    "try:\n",
    "    ngrams_folder = os.listdir(\"./{OUTPUT_FOLDER}/dpne_sample\".format(OUTPUT_FOLDER=OUTPUT_FOLDER))\n",
    "except:\n",
    "    print(\"Something went wrong in writing the ngrams in the previous step. Please double check\")\n",
    "\n",
    "DPNGRAMS = {} # will map string \"Ngram\" => pandas DataFrame containing those N-grams\n",
    "ngrams_folder.sort()\n",
    "for ngram in ngrams_folder:\n",
    "    # print stats of each ngram discovered\n",
    "    print(\"Stats on\", ngram)\n",
    "    for partfile in os.listdir(\"./{OUTPUT_FOLDER}/dpne_sample/{ngram}\".format(OUTPUT_FOLDER=OUTPUT_FOLDER, ngram=ngram)):\n",
    "        partfile_split = partfile.split(\".\")\n",
    "        if (len(partfile_split) == 2 and partfile_split[1] == \"json\"):\n",
    "            with open(os.path.join(\"./{OUTPUT_FOLDER}/dpne_sample/{ngram}\".format(OUTPUT_FOLDER=OUTPUT_FOLDER, ngram=ngram), partfile), 'r') as f:\n",
    "                DPNGRAMS[ngram] = pd.read_json(f, orient='records', lines=True)\n",
    "                display(DPNGRAMS[ngram])\n",
    "# Now you can use the appropriate dataframe for further investigation "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fc79470-7ddd-4ba2-8b69-459b21869267",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
README.md
@@ -14,6 +14,7 @@ The code repository structure is as follows:
- scripts: has scripts to run the code
- convert_msnbc.py: converts MSNBC data
- convert_reddit.py: converts Reddit data
- DPNE Experiments.ipynb: Jupyter notebook to run on the PySpark container for experimentation; it runs the same steps as the scripts in `run.cmd` (see below for instructions on how to run it)

## Prerequisites


@@ -24,32 +25,61 @@ The code requires following libraries installed:
- PySpark == 2.3
- shrike

## How to run
## Preparing container to run experiments
Running this code in a container makes getting started fairly easy and reliable. Follow these steps to get it running in a local container:
- Make sure you have [docker installed](https://docs.docker.com/engine/install/) and running
- Run `docker pull jupyter/pyspark-notebook` to pull the PySpark Jupyter container image
- Run the container with port 8888 mapped locally so you can use the notebook on your machine: `docker run -p 8888:8888 --name jupyter-pyspark jupyter/pyspark-notebook`. From the startup logs, paste the notebook URL into your browser - something like `http://127.0.0.1:8888/lab?token=<TOKEN>`
- Bash into the container by running `docker exec -it jupyter-pyspark bash`
- Run `git clone https://github.com/microsoft/differentially-private-ngram-extraction.git` to pull this repo into the container
- Install the required libraries as mentioned above:
```
pip install nltk
pip install pyspark
pip install shrike==1.31.18
```
Additionally, open a Python shell and run the following commands:
```
import nltk
nltk.download('punkt_tab')
```
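If you prefer not to open an interactive shell, the same download can be done as a one-liner from bash (assuming `python` resolves to the container's Python 3):
```
python -c "import nltk; nltk.download('punkt_tab')"
```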
Now, at this point, you can either replicate the results from the paper or run DP n-gram extraction on your own dataset. See the instructions for each case below:

### Prepare data
First, download the data from below:
### Replicate results from the paper
There are two data sources cited:
- MSNBC: https://archive.ics.uci.edu/ml/datasets/msnbc.com+anonymous+web+data
- Reddit: https://github.com/webis-de/webis-tldr-17-corpus, downloadable from https://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip

Then run the convert scripts from DPNE home directory,
To prepare the data from these sources in the required format, run the following scripts from the DPNE home directory:
```
python scripts/convert_msnbc.py --input_path [Input file path which has the downloaded file] --output_path [output directory, like /output]
python scripts/convert_reddit.py --input_path [Input file path which has the downloaded file] --output_path [output directory, like /output]
```
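For example, assuming the Reddit corpus was unzipped to corpus-webis-tldr-17.json in the repository root and that the output directory should be created first, an invocation might look like:
```
mkdir -p converted_output
python scripts/convert_reddit.py --input_path corpus-webis-tldr-17.json --output_path converted_output
```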
This is simplified within the attached notebook, where you can simply follow these steps:
- With the container running, navigate to the [notebook](http://127.0.0.1:8888/lab/tree/differentially-private-ngram-extraction/DPNE%20Experiments.ipynb) and run the code starting from the first cell, which downloads and prepares the data (for the reddit case).
- You may also change the default values of the variables `DP_EPSILON` and `NGRAM_SIZE_LIMIT` based on your needs. Run the commands in the cells, which should eventually provide you with the extracted DP n-grams in the `DPNGRAMS` dictionary - `DPNGRAMS["1gram"]` will be a pandas dataframe with the extracted DP 1-grams, and so on.
- Follow the steps in the subsequent cells, which break up the tokenization, splitting of n-grams, and DP n-gram extraction into separate Spark sessions, and cache the results locally.
- Once these scripts have run successfully, the final analysis cell reads the results into a dictionary of pandas dataframes, from which you can access the extracted DP n-grams.

### Run DPNE

1. Archive the dpne directory to dpne.zip, this is needed for PySpark to use the package of the whole python scripts
2. Use run.cmd, you will need to modify the first line of DATA_HOME where your converted data exists. Simply you can run it below from DPNE home directory,
### Run on your own dataset
- Copy a dataset into the differentially-private-ngram-extraction folder as a newline-delimited JSON file with keys "author" and "content", representing the distinct author name/id and the content you want to extract DP n-grams from, respectively (see the sample records after this list). From another terminal, you can use the command `docker cp /path/to/file.json jupyter-pyspark:/home/jovyan/differentially-private-ngram-extraction/`
- Now you can simply navigate to the [notebook](http://127.0.0.1:8888/lab/tree/differentially-private-ngram-extraction/DPNE%20Experiments.ipynb) and run the code, changing `SOURCE_DATASET` to the name of the JSON file you just copied. If you are using something other than JSON, change `FILE_EXTENSION` accordingly. You may also change the default values of the variables `DP_EPSILON` and `NGRAM_SIZE_LIMIT` based on your needs. Run the commands in the cells, which should eventually provide you with the extracted DP n-grams in the `DPNGRAMS` dictionary - `DPNGRAMS["1gram"]` will be a pandas dataframe with the extracted DP 1-grams, and so on.
- Follow the steps in the subsequent cells, which break up the tokenization, splitting of n-grams, and DP n-gram extraction into separate Spark sessions, and cache the results locally.
- Once these scripts have run successfully, the final analysis cell reads the results into a dictionary of pandas dataframes, from which you can access the extracted DP n-grams.
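For illustration, each line of such a dataset file is a standalone JSON record with these two keys; the values below are hypothetical:
```
{"author": "user_001", "content": "First document written by this author."}
{"author": "user_002", "content": "A document written by a different author."}
```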

### Run DPNE without the container
If you choose to run this in a shell, or with local modifications, without using the container method described above, simply follow these steps:
1. If you made changes to any file in the dpne/ folder, re-archive the dpne directory to dpne.zip; this is needed for PySpark to use the whole package of python scripts.
2. Assuming you are on a Windows machine, use run.cmd. You will need to modify the first line so that DATA_HOME points to where your converted data exists. You can then run it from the DPNE home directory:
```
.\scripts\run.cmd
```
If you are on a Linux-based environment, see the corresponding shell commands in the notebook; a sketch of the equivalent commands is shown below.
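For reference, here is a minimal Linux sketch of the same pipeline. The commands and parameter values mirror the notebook cells and their defaults (input `./reddit.json`, output folder `./output`, n-gram limit 10, epsilon 4.0), so adjust them to your own setup:
```
# Re-create dpne.zip so PySpark can use the dpne package (stdlib only, no zip utility needed)
python -m zipfile -c dpne.zip dpne/

# Tokenize, split n-grams, then extract DP n-grams (same arguments as the notebook cells)
# Note: --persist_flags 00 matches the notebook's local-run setting; the default is 11
spark-submit dpne/tokenize_text.py -f json --ngrams 10 --max_num_tokens 400 --allow_multiple_ngrams 1 \
    -i ./reddit.json -o ./output/tokenize_text -t json

spark-submit dpne/split_ngrams.py --ngram_size 10 \
    -i ./output/tokenize_text -o ./output/split_ngrams -f json -t json

spark-submit dpne/extract_dpne.py --dp_epsilon 4.0 --dp_eta 0.1 --dp_delta 0.5 --contribution_limit 10 \
    --persist_flags 00 --log_flags 00 --top_k 1 --delta_user_count 0 --ngram_size 10 --filter_one_side 0 \
    --budget_distribute 10.0 --estimate_sample_size 0.8 \
    -i ./output/split_ngrams -o ./output/dpne_sample -f json -t json
```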

## References
[1] Kunho Kim, Sivakanth Gopi, Janardhan Kulkarni, Sergey Yekhanin. Differentially Private n-gram Extraction. In Proceedings of the Thirty-fifth Conference on Neural Information Processing Systems (NeurIPS), 2021.


## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a

dpne.zip: Binary file not shown.
scripts/convert_reddit.py

@@ -7,7 +7,9 @@ to the json format we use for the DPNE code
import argparse
import json
import os

import logging
from dpne.dpne_utils import log
from shrike.compliant_logging import DataCategory, enable_compliant_logging

def convert_reddit(file_reader, file_writer, field):
    for line in file_reader:

@@ -62,6 +64,7 @@ def get_arg_parser(parser=None):
    return parser

if __name__ == '__main__':
    enable_compliant_logging()
    parser = get_arg_parser()
    args = parser.parse_args()

@@ -71,4 +74,6 @@ if __name__ == '__main__':
    output_path = os.path.join(args.output_path, "reddit.json")

    with open(input_path, 'r', encoding='utf-8') as file_reader, open(output_path, 'w', encoding='utf-8') as file_writer:
        log(logging.INFO, DataCategory.PUBLIC, "Initiating conversion of file..")
        convert_reddit(file_reader, file_writer, args.field)
        log(logging.INFO, DataCategory.PUBLIC, "Conversion complete.")