[simulation] Fake News Perceptron (#37)

Load fake news data from https://www.kaggle.com/c/fake-news/data
This commit is contained in:
Justin D. Harris 2019-10-10 15:58:50 -04:00 committed by GitHub
Parent 8565eb554e
Commit 6a98ec6ba5
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 417 additions and 17 deletions

View File

@@ -9,15 +9,14 @@ EXPOSE 5006
ENV LC_ALL C.UTF-8
RUN conda create --channel conda-forge --name decai-simulation --yes python=3.7 bokeh ipython mkl mkl-service numpy phantomjs scikit-learn scipy six tensorflow
RUN conda create --channel conda-forge --name decai-simulation --yes python=3.7 bokeh ipython mkl mkl-service numpy pandas phantomjs scikit-learn scipy six tensorflow
RUN conda init bash
RUN echo "conda activate decai-simulation" >> ~/.bashrc
WORKDIR /root/workspace/0xDeCA10B/simulation
COPY setup.py .
RUN conda run --name decai-simulation pip install -e .[test]
RUN echo "conda activate decai-simulation" >> ~/.bashrc
CMD ["bash"]

View File

@@ -30,7 +30,7 @@ Despite the malicious efforts, the accuracy can still be maintained and the hone
This section explains how to set up locally; alternatively, you can skip ahead and use a Docker image.
Run:
```bash
conda create --channel conda-forge --name decai-simulation python=3.7 bokeh ipython mkl mkl-service numpy phantomjs scikit-learn scipy six tensorflow
conda create --channel conda-forge --name decai-simulation python=3.7 bokeh ipython mkl mkl-service numpy pandas phantomjs scikit-learn scipy six tensorflow
conda activate decai-simulation
pip install -e .
```

View File

@@ -1,3 +1,4 @@
import logging
from abc import ABC, abstractmethod
from decai.simulation.contract.objects import SmartContract
@@ -13,12 +14,24 @@ class Classifier(ABC, SmartContract):
"""
Evaluate the model.
:param data: Many data samples.
:param data: Data samples.
:param labels: The ground truth labels for `data`.
:return: The accuracy for the given test set.
"""
pass
@abstractmethod
def log_evaluation_details(self, data, labels, level=logging.INFO) -> float:
"""
Log some evaluation details.
:param data: Data samples.
:param labels: The ground truth labels for `data`.
:param level: The level at which to log.
:return: The accuracy for the given test set.
"""
pass
@abstractmethod
def init_model(self, training_data, labels):
"""

View File

@@ -1,3 +1,4 @@
import logging
import os
import time
from dataclasses import dataclass
@@ -7,6 +8,7 @@ from typing import Any
import joblib
import numpy as np
from injector import inject, Module, provider, ClassAssistedBuilder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from decai.simulation.contract.classification.classifier import Classifier
@@ -33,6 +35,23 @@ class SciKitClassifier(Classifier):
self._logger.debug("Evaluating.")
return self._model.score(data, labels)
def log_evaluation_details(self, data, labels, level=logging.INFO) -> float:
assert self._model is not None, "The model has not been initialized yet."
assert isinstance(data, np.ndarray), "The data must be an array."
assert isinstance(labels, np.ndarray), "The labels must be an array."
self._logger.debug("Evaluating.")
predicted_labels = self._model.predict(data)
result = accuracy_score(labels, predicted_labels)
if self._logger.isEnabledFor(level):
m = confusion_matrix(labels, predicted_labels)
report = classification_report(labels, predicted_labels)
self._logger.log(level,
"Confusion matrix:\n%s"
"\nReport:\n%s"
"\nAccuracy: %0.2f%%",
m, report, result * 100)
return result
def init_model(self, training_data, labels):
assert self._model is None, "The model has already been initialized."
self._logger.debug("Initializing model.")

View File

@@ -0,0 +1,266 @@
import itertools
import json
import os
import random
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from logging import Logger
from operator import itemgetter
from typing import Optional, Collection, Tuple
import numpy as np
import pandas as pd
import spacy
from injector import ClassAssistedBuilder, inject, Module, provider, singleton
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.cli import download
from tqdm import tqdm
from .data_loader import DataLoader
class Label(Enum):
RELIABLE = 0
UNRELIABLE = 1
@dataclass
class News:
text: Optional[str]
label: Label
@inject
@dataclass
class _SignalMediaDataLoader(DataLoader):
"""
INCOMPLETE BECAUSE MAPPING THE SOURCE NAMES TO DOMAIN NAMES IS TRICKY.
See https://github.com/aldengolab/fake-news-detection/issues/4
Following logic of https://github.com/aldengolab/fake-news-detection.
Requires the Signal Media dataset from http://research.signalmedia.co/newsir16/signal-dataset.html to be at
simulation/training_data/news/sample-1M.jsonl
and https://github.com/OpenSourcesGroup/opensources with sources.json in simulation/training_data/news/
"""
_logger: Logger
_media_types = {'News'}
def find_source_site(self, source_name: str, sources: Collection[str]) -> Optional[str]:
"""
:param source_name: The name of the source.
:param sources: Source domain names.
:return: The source domain name from `sources` or `None` if no mapping can be found.
"""
# TODO
result = None
return result
def load_data(self, train_size: int = None, test_size: int = None) -> (tuple, tuple):
data_folder_path = os.path.join(__file__, '../../../../training_data/news')
signal_data_path = os.path.join(data_folder_path, 'sample-1M.jsonl')
if not os.path.exists(signal_data_path):
raise Exception(f"Could not find the Signal Media dataset at \"{signal_data_path}\"."
"\nYou must obtain it from http://research.signalmedia.co/newsir16/signal-dataset.html"
f" and follow the instructions to obtain it. Then extract it to \"{signal_data_path}\".")
sources_path = os.path.join(data_folder_path, 'sources.json')
if not os.path.exists(sources_path):
raise Exception(f"Could not find the sources dataset at \"{sources_path}\"."
"\nYou must obtain it from https://github.com/OpenSourcesGroup/opensources and put"
f" sources.json in \"{data_folder_path}\".")
with open(sources_path) as f:
loaded_sources = json.load(f)
sources = dict()
for source, info in loaded_sources.items():
problem_types = (info['type'], info['2nd type'], info['3rd type'])
sources[source] = set(filter(None, problem_types))
self._logger.info("Found %d sources with labels.", len(sources))
# Name: website name in `sources`.
source_mapping = {}
not_found_flag = -1
with open(signal_data_path) as f:
for index, line in tqdm(enumerate(f),
desc="Filtering news articles",
unit_scale=True, mininterval=2, unit=" articles"
):
news = json.loads(line)
news_id = news['id']
title = news['title']
text = news['content']
source = news['source']
# media-type is either "News" or "Blog"
media_type = news['media-type']
published_date = news['published']
if media_type not in self._media_types:
continue
source_site = source_mapping.get(source)
if source_site is None:
source_site = self.find_source_site(source, sources)
if source_site is not None:
source_mapping[source] = source_site
else:
source_mapping[source] = not_found_flag
continue
elif source_site == not_found_flag:
continue
# TODO Use article and set label.
with open(os.path.join(data_folder_path, 'source_mapping.json')) as f:
sorted(source_mapping.items(), key=itemgetter(0))
self._logger.info("Found %d sources in the articles.", len(source_mapping))
# TODO Set up output.
(x_train, y_train), (x_test, y_test) = (None, None), (None, None)
if train_size is not None:
x_train, y_train = x_train[:train_size], y_train[:train_size]
if test_size is not None:
x_test, y_test = x_test[:test_size], y_test[:test_size]
self._logger.info("Done loading news data.")
return (x_train, y_train), (x_test, y_test)
@inject
@dataclass
class NewsDataLoader(DataLoader):
"""
Load data from news sources.
Requires data from https://www.kaggle.com/c/fake-news/data to be saved to "simulation/training_data/news/fake-news/train.csv".
"""
_logger: Logger
_train_split = 0.7
_replace_entities_enabled = False
"""
If True, entities will be replaced in text with the entity's label surrounded by angle brackets: "<LABEL>".
Accuracy with replacement: 0.9172
Accuracy without replacement: 0.9173
Disabled because using spaCy is slow, it will be tricky to use spaCy in JavaScript,
and it didn't change the evaluation metrics much.
"""
_entity_types_to_replace = {'PERSON', 'GPE', 'ORG', 'DATE', 'TIME', 'PERCENT',
'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'}
def __post_init__(self):
spacy_model = 'en_core_web_lg'
download(spacy_model)
self._nlp = spacy.load(spacy_model, disable={'tagger', 'parser', 'textcat'})
def _load_kaggle_data(self, data_folder_path: str) -> Collection[News]:
"""
Load data from https://www.kaggle.com/c/fake-news/data.
"""
# Don't use the test data because it has no labels.
fake_news_data_path = os.path.join(data_folder_path, 'fake-news', 'train.csv')
if not os.path.exists(fake_news_data_path):
raise Exception(f"Could not find the Fake News dataset at \"{fake_news_data_path}\"."
"\nYou must obtain it from https://www.kaggle.com/c/fake-news/data.")
data = pd.read_csv(fake_news_data_path, na_values=dict(text=[]), keep_default_na=False)
result = []
for row in data.itertuples():
label = Label.RELIABLE if row.label == 0 else Label.UNRELIABLE
if len(row.text) > 0:
result.append(News(row.text, label))
# Consistent shuffle to aim for a mostly even distribution of labels.
random.shuffle(result, lambda: 0.618)
return result
def _replace_entities(self, doc) -> str:
# Remove names in text using spaCy.
result = doc.text
for ent in doc.ents[::-1]:
if ent.label_ in self._entity_types_to_replace:
result = result[:ent.start_char] + "<" + ent.label_ + ">" + result[ent.end_char:]
return result
def _pre_process_text(self, doc) -> str:
# TODO Remove name of news sources.
if self._replace_entities_enabled:
result = self._replace_entities(doc)
else:
assert isinstance(doc, str)
result = doc
return result
def _pre_process(self, news_articles: Collection[News], train_size: int, test_size: int) -> \
Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
self._logger.info("Getting features for %d articles.", len(news_articles))
# Only use binary features.
ngram_range = (2, 2)
t = TfidfVectorizer(max_features=3000, ngram_range=ngram_range)
test_start = len(news_articles) - test_size
x_train = map(lambda news: news.text, itertools.islice(news_articles, train_size))
x_test = map(lambda news: news.text, itertools.islice(news_articles, test_start, len(news_articles)))
if self._replace_entities_enabled:
x_train = self._nlp.pipe(x_train, batch_size=128)
x_test = self._nlp.pipe(x_test, batch_size=128)
x_train = map(self._pre_process_text, x_train)
x_test = map(self._pre_process_text, x_test)
x_train = t.fit_transform(tqdm(x_train,
desc="Processing training data",
total=train_size,
unit_scale=True, mininterval=2,
unit=" articles"
)).toarray()
x_test = t.transform(tqdm(x_test,
desc="Processing testing data",
total=test_size,
unit_scale=True, mininterval=2,
unit=" articles"
)).toarray()
y_train = np.array([news.label.value for news in itertools.islice(news_articles, train_size)], np.int8)
y_test = np.array([news.label.value for news in itertools.islice(news_articles,
test_start, len(news_articles))], np.int8)
self._logger.debug("Training labels: %s", Counter(y_train))
self._logger.debug("Test labels: %s", Counter(y_test))
self._logger.info("Done getting features.")
return (x_train, y_train), (x_test, y_test)
def load_data(self, train_size: int = None, test_size: int = None) -> \
Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
self._logger.info("Loading news data.")
data_folder_path = os.path.join(__file__, '../../../../training_data/news')
data = self._load_kaggle_data(data_folder_path)
# Separate train and test data.
if train_size is None:
if test_size is None:
train_size = int(self._train_split * len(data))
else:
train_size = len(data) - test_size
if test_size is None:
test_size = len(data) - train_size
if train_size + test_size > len(data):
raise Exception("There is not enough data for the requested sizes."
f"\n data size: {len(data)}"
f"\n train size: {train_size}"
f"\n test size: {test_size}")
(x_train, y_train), (x_test, y_test) = self._pre_process(data, train_size, test_size)
self._logger.info("Done loading news data.")
return (x_train, y_train), (x_test, y_test)
@dataclass
class NewsDataModule(Module):
@provider
@singleton
def provide_data_loader(self, builder: ClassAssistedBuilder[NewsDataLoader]) -> DataLoader:
return builder.build()
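A rough, self-contained sketch of the featurization used in `_pre_process` above (bigram TF-IDF capped at 3000 features); the example texts are made up:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

train_texts = ["the president said the economy is strong",
               "scientists discovered a new species of frog"]
test_texts = ["the president said the frog is strong"]

# Bigrams only and a capped vocabulary, mirroring _pre_process.
t = TfidfVectorizer(max_features=3000, ngram_range=(2, 2))
x_train = t.fit_transform(train_texts).toarray()  # fit the vocabulary on training data only
x_test = t.transform(test_texts).toarray()        # reuse that vocabulary for the test data
print(x_train.shape, x_test.shape)
```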

View File

@@ -0,0 +1,30 @@
import unittest
from typing import cast
from injector import Injector
from decai.simulation.data.data_loader import DataLoader
from decai.simulation.data.news_data_loader import NewsDataLoader, NewsDataModule
from decai.simulation.logging_module import LoggingModule
class TestNewsDataLoader(unittest.TestCase):
@classmethod
def setUpClass(cls):
inj = Injector([
LoggingModule,
NewsDataModule,
])
cls.data_loader = inj.get(DataLoader)
assert isinstance(cls.data_loader, NewsDataLoader)
cls.data_loader = cast(NewsDataLoader, cls.data_loader)
@unittest.skip("The dataset does not exist on CI test machine.")
def test_load_data(self):
(x_train, y_train), (x_test, y_test) = self.data_loader.load_data()
def test_entities(self):
doc = self.data_loader._nlp("Today, John Smith walked to a store and bought an apple.")
actual = self.data_loader._replace_entities(doc)
self.assertEqual("<DATE>, <PERSON> walked to a store and bought an apple.", actual)

View File

@@ -105,7 +105,7 @@ class TitanicDataLoader(DataLoader):
data_folder_path = os.path.join(__file__, '../../../../training_data/titanic')
if not os.path.exists(data_folder_path):
# TODO Attempt to download the data.
raise Exception(f"Could not find Titanic dataset at {data_folder_path}"
raise Exception(f"Could not find Titanic dataset at \"{data_folder_path}\"."
"\nYou must download it from https://www.kaggle.com/c/titanic/data.")
x_train = pd.read_csv(os.path.join(data_folder_path, 'train.csv'))

View File

@@ -221,7 +221,7 @@ class Simulator(object):
self._logger.debug("Remaining training data evaluation: %s", s)
self._logger.info("Evaluating initial model.")
accuracy = self._decai.model.evaluate(x_test, y_test)
accuracy = self._decai.model.log_evaluation_details(x_test, y_test)
self._logger.info("Initial test set accuracy: %0.2f%%", accuracy * 100)
t = self._time()
doc.add_next_tick_callback(
@@ -239,6 +239,7 @@ class Simulator(object):
next_data_index = 0
next_accuracy_plot_time = 1E4
desc = "Processing agent requests"
current_time = 0
with tqdm(desc=desc,
unit_scale=True, mininterval=2, unit=" requests",
total=len(x_remaining),
@@ -436,9 +437,9 @@ class Simulator(object):
self._logger.info("Done issuing rewards.")
accuracy = self._decai.model.evaluate(x_test, y_test)
accuracy = self._decai.model.log_evaluation_details(x_test, y_test)
doc.add_next_tick_callback(
partial(plot_accuracy_cb, t=current_time, a=accuracy))
partial(plot_accuracy_cb, t=current_time + 100, a=accuracy))
with open(save_path, 'w') as f:
json.dump(save_data, f, separators=(',', ':'))

View File

@@ -0,0 +1,78 @@
import os
import sys
from typing import Optional
from injector import Injector
from decai.simulation.contract.classification.perceptron import PerceptronModule
from decai.simulation.contract.collab_trainer import DefaultCollaborativeTrainerModule
from decai.simulation.contract.incentive.stakeable import StakeableImModule
from decai.simulation.data.news_data_loader import NewsDataModule
from decai.simulation.logging_module import LoggingModule
from decai.simulation.simulate import Agent, Simulator
# For `bokeh serve`.
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
train_size: Optional[int] = None
test_size: Optional[int] = None
if train_size is None:
init_train_data_portion = 0.08
else:
init_train_data_portion = 100 / train_size
def main():
# Set up the agents that will act in the simulation.
agents = [
# Good
Agent(address="Good",
start_balance=10_000,
mean_deposit=50,
stdev_deposit=10,
mean_update_wait_s=10 * 60,
prob_mistake=0.0001,
),
# Malicious: A determined agent with the goal of disrupting others.
Agent(address="Bad",
start_balance=10_000,
mean_deposit=100,
stdev_deposit=3,
mean_update_wait_s=1 * 60 * 60,
good=False,
),
# One that just calls the model and pays to use the model.
Agent(address="Caller",
start_balance=30_000,
mean_deposit=0,
stdev_deposit=0,
mean_update_wait_s=2 * 60 * 60,
calls_model=True,
pay_to_call=50
),
]
# No caller (assume free to call).
agents = agents[:-1]
# Set up the data, model, and incentive mechanism.
inj = Injector([
DefaultCollaborativeTrainerModule,
NewsDataModule(),
LoggingModule,
PerceptronModule,
StakeableImModule,
])
s = inj.get(Simulator)
# Start the simulation.
s.simulate(agents,
baseline_accuracy=0.9173,
init_train_data_portion=init_train_data_portion,
train_size=train_size,
test_size=test_size,
)
# Run with `bokeh serve PATH`.
if __name__.startswith('bk_script_'):
main()
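The `bk_script_` prefix appears to be the module name Bokeh assigns when a script is launched via `bokeh serve`. A hypothetical variant (not in this commit) that also allows plain `python` execution for debugging:

```python
# Hypothetical: keep the `bokeh serve` entry point but also support `python <script>`.
if __name__.startswith('bk_script_') or __name__ == '__main__':
    main()
```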

View File

@@ -7,19 +7,13 @@ install_requires = [
'expiringdict>=1.1.4',
'injector>=0.16.2',
'joblib>=0.13.2',
# Use a specific commit because the latest released version has a bug:
# https://github.com/keras-team/keras/issues/12729
# Fix: https://github.com/keras-team/keras/pull/12714
'keras @ git+https://github.com/keras-team/keras.git@47e1b18c0b7e3ddeef4e9fcded409a55d0479a4f',
# Used to be: 'Keras>=2.1',
# We need 'Keras>2.2.4', but it doesn't exist yet.
'keras>=2.3',
'numpy',
# Required for saving plots.
'selenium>=3.141.0',
'scikit-multiflow>=0.3.0',
'spacy>=2.2',
'tqdm>=4.19',
]