[simulation] Fake News Perceptron (#37)

Load fake news data from https://www.kaggle.com/c/fake-news/data
This commit is contained in:
Justin D. Harris 2019-10-10 15:58:50 -04:00 committed by GitHub
Parent 8565eb554e
Commit 6a98ec6ba5
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 417 additions and 17 deletions

View File

@@ -9,15 +9,14 @@ EXPOSE 5006
ENV LC_ALL C.UTF-8
RUN conda create --channel conda-forge --name decai-simulation --yes python=3.7 bokeh ipython mkl mkl-service numpy phantomjs scikit-learn scipy six tensorflow
RUN conda create --channel conda-forge --name decai-simulation --yes python=3.7 bokeh ipython mkl mkl-service numpy pandas phantomjs scikit-learn scipy six tensorflow
RUN conda init bash
RUN echo "conda activate decai-simulation" >> ~/.bashrc
WORKDIR /root/workspace/0xDeCA10B/simulation
COPY setup.py .
RUN conda run --name decai-simulation pip install -e .[test]
RUN echo "conda activate decai-simulation" >> ~/.bashrc
CMD ["bash"]

View File

@@ -30,7 +30,7 @@ Despite the malicious efforts, the accuracy can still be maintained and the hone
This section explains how to set up locally; alternatively, you can skip ahead and use a Docker image.
Run:
```bash
conda create --channel conda-forge --name decai-simulation python=3.7 bokeh ipython mkl mkl-service numpy phantomjs scikit-learn scipy six tensorflow
conda create --channel conda-forge --name decai-simulation python=3.7 bokeh ipython mkl mkl-service numpy pandas phantomjs scikit-learn scipy six tensorflow
conda activate decai-simulation
pip install -e .
```

View File

@@ -1,3 +1,4 @@
import logging
from abc import ABC, abstractmethod
from decai.simulation.contract.objects import SmartContract
@@ -13,12 +14,24 @@ class Classifier(ABC, SmartContract):
"""
Evaluate the model.
:param data: Many data samples.
:param data: Data samples.
:param labels: The ground truth labels for `data`.
:return: The accuracy for the given test set.
"""
pass
@abstractmethod
def log_evaluation_details(self, data, labels, level=logging.INFO) -> float:
"""
Log some evaluation details.
:param data: Data samples.
:param labels: The ground truth labels for `data`.
:param level: The level at which to log.
:return: The accuracy for the given test set.
"""
pass
@abstractmethod
def init_model(self, training_data, labels):
"""

View File

@@ -1,3 +1,4 @@
import logging
import os
import time
from dataclasses import dataclass
@@ -7,6 +8,7 @@ from typing import Any
import joblib
import numpy as np
from injector import inject, Module, provider, ClassAssistedBuilder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from decai.simulation.contract.classification.classifier import Classifier
@@ -33,6 +35,23 @@ class SciKitClassifier(Classifier):
self._logger.debug("Evaluating.")
return self._model.score(data, labels)
def log_evaluation_details(self, data, labels, level=logging.INFO) -> float:
assert self._model is not None, "The model has not been initialized yet."
assert isinstance(data, np.ndarray), "The data must be an array."
assert isinstance(labels, np.ndarray), "The labels must be an array."
self._logger.debug("Evaluating.")
predicted_labels = self._model.predict(data)
result = accuracy_score(labels, predicted_labels)
if self._logger.isEnabledFor(level):
m = confusion_matrix(labels, predicted_labels)
report = classification_report(labels, predicted_labels)
self._logger.log(level,
"Confusion matrix:\n%s"
"\nReport:\n%s"
"\nAccuracy: %0.2f%%",
m, report, result * 100)
return result
def init_model(self, training_data, labels):
assert self._model is None, "The model has already been initialized."
self._logger.debug("Initializing model.")

View File

@@ -0,0 +1,266 @@
import itertools
import json
import os
import random
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from logging import Logger
from operator import itemgetter
from typing import Optional, Collection, Tuple
import numpy as np
import pandas as pd
import spacy
from injector import ClassAssistedBuilder, inject, Module, provider, singleton
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.cli import download
from tqdm import tqdm
from .data_loader import DataLoader
class Label(Enum):
RELIABLE = 0
UNRELIABLE = 1
@dataclass
class News:
text: Optional[str]
label: Label
@inject
@dataclass
class _SignalMediaDataLoader(DataLoader):
"""
INCOMPLETE BECAUSE MAPPING THE SOURCE NAMES TO DOMAIN NAMES IS TRICKY.
See https://github.com/aldengolab/fake-news-detection/issues/4
Following logic of https://github.com/aldengolab/fake-news-detection.
Requires the Signal Media dataset from http://research.signalmedia.co/newsir16/signal-dataset.html to be at
simulation/training_data/news/sample-1M.jsonl
and https://github.com/OpenSourcesGroup/opensources with sources.json in simulation/training_data/news/
"""
_logger: Logger
_media_types = {'News'}
def find_source_site(self, source_name: str, sources: Collection[str]) -> Optional[str]:
"""
:param source_name: The name of the source.
:param sources: Source domain names.
:return: The source domain name from `sources` or `None` if no mapping can be found.
"""
# TODO
result = None
return result
def load_data(self, train_size: int = None, test_size: int = None) -> (tuple, tuple):
data_folder_path = os.path.join(__file__, '../../../../training_data/news')
signal_data_path = os.path.join(data_folder_path, 'sample-1M.jsonl')
if not os.path.exists(signal_data_path):
raise Exception(f"Could not find the Signal Media dataset at \"{signal_data_path}\"."
"\nYou must obtain it from http://research.signalmedia.co/newsir16/signal-dataset.html"
f" and follow the instructions to obtain it. Then extract it to \"{signal_data_path}\".")
sources_path = os.path.join(data_folder_path, 'sources.json')
if not os.path.exists(sources_path):
raise Exception(f"Could not find the sources dataset at \"{sources_path}\"."
"\nYou must obtain it from https://github.com/OpenSourcesGroup/opensources and put"
f" sources.json in \"{data_folder_path}\".")
with open(sources_path) as f:
loaded_sources = json.load(f)
sources = dict()
for source, info in loaded_sources.items():
problem_types = (info['type'], info['2nd type'], info['3rd type'])
sources[source] = set(filter(None, problem_types))
self._logger.info("Found %d sources with labels.", len(sources))
# Name: website name in `sources`.
source_mapping = {}
not_found_flag = -1
with open(signal_data_path) as f:
for index, line in tqdm(enumerate(f),
desc="Filtering news articles",
unit_scale=True, mininterval=2, unit=" articles"
):
news = json.loads(line)
news_id = news['id']
title = news['title']
text = news['content']
source = news['source']
# media-type is either "News" or "Blog"
media_type = news['media-type']
published_date = news['published']
if media_type not in self._media_types:
continue
source_site = source_mapping.get(source)
if source_site is None:
source_site = self.find_source_site(source, sources)
if source_site is not None:
source_mapping[source] = source_site
else:
source_mapping[source] = not_found_flag
continue
elif source_site == not_found_flag:
continue
# TODO Use article and set label.
with open(os.path.join(data_folder_path, 'source_mapping.json')) as f:
sorted(source_mapping.items(), key=itemgetter(0))
self._logger.info("Found %d sources in the articles.", len(source_mapping))
# TODO Set up output.
(x_train, y_train), (x_test, y_test) = (None, None), (None, None)
if train_size is not None:
x_train, y_train = x_train[:train_size], y_train[:train_size]
if test_size is not None:
x_test, y_test = x_test[:test_size], y_test[:test_size]
self._logger.info("Done loading news data.")
return (x_train, y_train), (x_test, y_test)
@inject
@dataclass
class NewsDataLoader(DataLoader):
"""
Load data from news sources.
Requires data from https://www.kaggle.com/c/fake-news/data to be saved to "simulation/training_data/news/fake-news/train.csv".
"""
_logger: Logger
_train_split = 0.7
_replace_entities_enabled = False
"""
If True, entities will be replaced in text with the entity's label surrounded by angle brackets: "<LABEL>".
Accuracy with replacement: 0.9172
Accuracy without replacement: 0.9173
Disabled because using spaCy is slow, it will be tricky to use spaCy in JavaScript,
and it didn't change the evaluation metrics much.
"""
_entity_types_to_replace = {'PERSON', 'GPE', 'ORG', 'DATE', 'TIME', 'PERCENT',
'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'}
def __post_init__(self):
spacy_model = 'en_core_web_lg'
download(spacy_model)
self._nlp = spacy.load(spacy_model, disable={'tagger', 'parser', 'textcat'})
def _load_kaggle_data(self, data_folder_path: str) -> Collection[News]:
"""
Load data from https://www.kaggle.com/c/fake-news/data.
"""
# Don't use the test data because it has no labels.
fake_news_data_path = os.path.join(data_folder_path, 'fake-news', 'train.csv')
if not os.path.exists(fake_news_data_path):
raise Exception(f"Could not find the Fake News dataset at \"{fake_news_data_path}\"."
"\nYou must obtain it from https://www.kaggle.com/c/fake-news/data.")
data = pd.read_csv(fake_news_data_path, na_values=dict(text=[]), keep_default_na=False)
result = []
for row in data.itertuples():
label = Label.RELIABLE if row.label == 0 else Label.UNRELIABLE
if len(row.text) > 0:
result.append(News(row.text, label))
# Consistent shuffle to aim for a mostly even distribution of labels.
random.shuffle(result, lambda: 0.618)
return result
def _replace_entities(self, doc) -> str:
# Remove names in text using spaCy.
result = doc.text
for ent in doc.ents[::-1]:
if ent.label_ in self._entity_types_to_replace:
result = result[:ent.start_char] + "<" + ent.label_ + ">" + result[ent.end_char:]
return result
def _pre_process_text(self, doc) -> str:
# TODO Remove name of news sources.
if self._replace_entities_enabled:
result = self._replace_entities(doc)
else:
assert isinstance(doc, str)
result = doc
return result
def _pre_process(self, news_articles: Collection[News], train_size: int, test_size: int) -> \
Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
self._logger.info("Getting features for %d articles.", len(news_articles))
# Only use binary features.
ngram_range = (2, 2)
t = TfidfVectorizer(max_features=3000, ngram_range=ngram_range)
test_start = len(news_articles) - test_size
x_train = map(lambda news: news.text, itertools.islice(news_articles, train_size))
x_test = map(lambda news: news.text, itertools.islice(news_articles, test_start, len(news_articles)))
if self._replace_entities_enabled:
x_train = self._nlp.pipe(x_train, batch_size=128)
x_test = self._nlp.pipe(x_test, batch_size=128)
x_train = map(self._pre_process_text, x_train)
x_test = map(self._pre_process_text, x_test)
x_train = t.fit_transform(tqdm(x_train,
desc="Processing training data",
total=train_size,
unit_scale=True, mininterval=2,
unit=" articles"
)).toarray()
x_test = t.transform(tqdm(x_test,
desc="Processing testing data",
total=test_size,
unit_scale=True, mininterval=2,
unit=" articles"
)).toarray()
y_train = np.array([news.label.value for news in itertools.islice(news_articles, train_size)], np.int8)
y_test = np.array([news.label.value for news in itertools.islice(news_articles,
test_start, len(news_articles))], np.int8)
self._logger.debug("Training labels: %s", Counter(y_train))
self._logger.debug("Test labels: %s", Counter(y_test))
self._logger.info("Done getting features.")
return (x_train, y_train), (x_test, y_test)
def load_data(self, train_size: int = None, test_size: int = None) -> \
Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
self._logger.info("Loading news data.")
data_folder_path = os.path.join(__file__, '../../../../training_data/news')
data = self._load_kaggle_data(data_folder_path)
# Separate train and test data.
if train_size is None:
if test_size is None:
train_size = int(self._train_split * len(data))
else:
train_size = len(data) - test_size
if test_size is None:
test_size = len(data) - train_size
if train_size + test_size > len(data):
raise Exception("There is not enough data for the requested sizes."
f"\n data size: {len(data)}"
f"\n train size: {train_size}"
f"\n test size: {test_size}")
(x_train, y_train), (x_test, y_test) = self._pre_process(data, train_size, test_size)
self._logger.info("Done loading news data.")
return (x_train, y_train), (x_test, y_test)
@dataclass
class NewsDataModule(Module):
@provider
@singleton
def provide_data_loader(self, builder: ClassAssistedBuilder[NewsDataLoader]) -> DataLoader:
return builder.build()
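A rough, self-contained sketch of the featurization used in `_pre_process` above (bigram TF-IDF capped at 3000 features); the example texts are made up:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

train_texts = ["the president said the economy is strong",
               "scientists discovered a new species of frog"]
test_texts = ["the president said the frog is strong"]

# Bigrams only and a capped vocabulary, mirroring _pre_process.
t = TfidfVectorizer(max_features=3000, ngram_range=(2, 2))
x_train = t.fit_transform(train_texts).toarray()  # fit the vocabulary on training data only
x_test = t.transform(test_texts).toarray()        # reuse that vocabulary for the test data
print(x_train.shape, x_test.shape)
```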

View File

@@ -0,0 +1,30 @@
import unittest
from typing import cast
from injector import Injector
from decai.simulation.data.data_loader import DataLoader
from decai.simulation.data.news_data_loader import NewsDataLoader, NewsDataModule
from decai.simulation.logging_module import LoggingModule
class TestNewsDataLoader(unittest.TestCase):
@classmethod
def setUpClass(cls):
inj = Injector([
LoggingModule,
NewsDataModule,
])
cls.data_loader = inj.get(DataLoader)
assert isinstance(cls.data_loader, NewsDataLoader)
cls.data_loader = cast(NewsDataLoader, cls.data_loader)
@unittest.skip("The dataset does not exist on CI test machine.")
def test_load_data(self):
(x_train, y_train), (x_test, y_test) = self.data_loader.load_data()
def test_entities(self):
doc = self.data_loader._nlp("Today, John Smith walked to a store and bought an apple.")
actual = self.data_loader._replace_entities(doc)
self.assertEqual("<DATE>, <PERSON> walked to a store and bought an apple.", actual)

View File

@@ -105,7 +105,7 @@ class TitanicDataLoader(DataLoader):
data_folder_path = os.path.join(__file__, '../../../../training_data/titanic')
if not os.path.exists(data_folder_path):
# TODO Attempt to download the data.
raise Exception(f"Could not find Titanic dataset at {data_folder_path}"
raise Exception(f"Could not find Titanic dataset at \"{data_folder_path}\"."
"\nYou must download it from https://www.kaggle.com/c/titanic/data.")
x_train = pd.read_csv(os.path.join(data_folder_path, 'train.csv'))

View File

@@ -221,7 +221,7 @@ class Simulator(object):
self._logger.debug("Remaining training data evaluation: %s", s)
self._logger.info("Evaluating initial model.")
accuracy = self._decai.model.evaluate(x_test, y_test)
accuracy = self._decai.model.log_evaluation_details(x_test, y_test)
self._logger.info("Initial test set accuracy: %0.2f%%", accuracy * 100)
t = self._time()
doc.add_next_tick_callback(
@@ -239,6 +239,7 @@ class Simulator(object):
next_data_index = 0
next_accuracy_plot_time = 1E4
desc = "Processing agent requests"
current_time = 0
with tqdm(desc=desc,
unit_scale=True, mininterval=2, unit=" requests",
total=len(x_remaining),
@@ -436,9 +437,9 @@ class Simulator(object):
self._logger.info("Done issuing rewards.")
accuracy = self._decai.model.evaluate(x_test, y_test)
accuracy = self._decai.model.log_evaluation_details(x_test, y_test)
doc.add_next_tick_callback(
partial(plot_accuracy_cb, t=current_time, a=accuracy))
partial(plot_accuracy_cb, t=current_time + 100, a=accuracy))
with open(save_path, 'w') as f:
json.dump(save_data, f, separators=(',', ':'))

View File

@@ -0,0 +1,78 @@
import os
import sys
from typing import Optional
from injector import Injector
from decai.simulation.contract.classification.perceptron import PerceptronModule
from decai.simulation.contract.collab_trainer import DefaultCollaborativeTrainerModule
from decai.simulation.contract.incentive.stakeable import StakeableImModule
from decai.simulation.data.news_data_loader import NewsDataModule
from decai.simulation.logging_module import LoggingModule
from decai.simulation.simulate import Agent, Simulator
# For `bokeh serve`.
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
train_size: Optional[int] = None
test_size: Optional[int] = None
if train_size is None:
init_train_data_portion = 0.08
else:
init_train_data_portion = 100 / train_size
def main():
# Set up the agents that will act in the simulation.
agents = [
# Good
Agent(address="Good",
start_balance=10_000,
mean_deposit=50,
stdev_deposit=10,
mean_update_wait_s=10 * 60,
prob_mistake=0.0001,
),
# Malicious: A determined agent with the goal of disrupting others.
Agent(address="Bad",
start_balance=10_000,
mean_deposit=100,
stdev_deposit=3,
mean_update_wait_s=1 * 60 * 60,
good=False,
),
# One that just calls the model and pays to use the model.
Agent(address="Caller",
start_balance=30_000,
mean_deposit=0,
stdev_deposit=0,
mean_update_wait_s=2 * 60 * 60,
calls_model=True,
pay_to_call=50
),
]
# No caller (assume free to call).
agents = agents[:-1]
# Set up the data, model, and incentive mechanism.
inj = Injector([
DefaultCollaborativeTrainerModule,
NewsDataModule(),
LoggingModule,
PerceptronModule,
StakeableImModule,
])
s = inj.get(Simulator)
# Start the simulation.
s.simulate(agents,
baseline_accuracy=0.9173,
init_train_data_portion=init_train_data_portion,
train_size=train_size,
test_size=test_size,
)
# Run with `bokeh serve PATH`.
if __name__.startswith('bk_script_'):
main()
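The `bk_script_` prefix appears to be the module name Bokeh assigns when a script is launched via `bokeh serve`. A hypothetical variant (not in this commit) that also allows plain `python` execution for debugging:

```python
# Hypothetical: keep the `bokeh serve` entry point but also support `python <script>`.
if __name__.startswith('bk_script_') or __name__ == '__main__':
    main()
```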

View File

@@ -7,19 +7,13 @@ install_requires = [
'expiringdict>=1.1.4',
'injector>=0.16.2',
'joblib>=0.13.2',
# Use a specific commit because the latest released version has a bug:
# https://github.com/keras-team/keras/issues/12729
# Fix: https://github.com/keras-team/keras/pull/12714
'keras @ git+https://github.com/keras-team/keras.git@47e1b18c0b7e3ddeef4e9fcded409a55d0479a4f',
# Used to be: 'Keras>=2.1',
# We need 'Keras>2.2.4', but it doesn't exist yet.
'keras>=2.3',
'numpy',
# Required for saving plots.
'selenium>=3.141.0',
'scikit-multiflow>=0.3.0',
'spacy>=2.2',
'tqdm>=4.19',
]