[simulation] Fake News Perceptron (#37)
Load fake news data from https://www.kaggle.com/c/fake-news/data
This commit is contained in:
Parent 8565eb554e
Commit 6a98ec6ba5
@@ -9,15 +9,14 @@ EXPOSE 5006
ENV LC_ALL C.UTF-8

-RUN conda create --channel conda-forge --name decai-simulation --yes python=3.7 bokeh ipython mkl mkl-service numpy phantomjs scikit-learn scipy six tensorflow
+RUN conda create --channel conda-forge --name decai-simulation --yes python=3.7 bokeh ipython mkl mkl-service numpy pandas phantomjs scikit-learn scipy six tensorflow

RUN conda init bash
RUN echo "conda activate decai-simulation" >> ~/.bashrc

WORKDIR /root/workspace/0xDeCA10B/simulation
COPY setup.py .

RUN conda run --name decai-simulation pip install -e .[test]

RUN echo "conda activate decai-simulation" >> ~/.bashrc

CMD ["bash"]
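For reference, a hypothetical way to build and try the updated image locally. The image tag is an assumption and not part of this commit; port 5006 comes from the `EXPOSE 5006` context above.

```bash
# Hypothetical commands; adjust the tag and build context to your checkout.
docker build -t decai-simulation .
# Map the exposed Bokeh port so plots served from the container are reachable.
docker run --rm -it -p 5006:5006 decai-simulation
```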
@@ -30,7 +30,7 @@ Despite the malicious efforts, the accuracy can still be maintained and the hone
This section explains how to set up locally; alternatively, you can skip ahead and use a Docker image.
Run:
```bash
-conda create --channel conda-forge --name decai-simulation python=3.7 bokeh ipython mkl mkl-service numpy phantomjs scikit-learn scipy six tensorflow
+conda create --channel conda-forge --name decai-simulation python=3.7 bokeh ipython mkl mkl-service numpy pandas phantomjs scikit-learn scipy six tensorflow
conda activate decai-simulation
pip install -e .
```
@@ -1,3 +1,4 @@
+import logging
from abc import ABC, abstractmethod

from decai.simulation.contract.objects import SmartContract
@@ -13,12 +14,24 @@ class Classifier(ABC, SmartContract):
        """
        Evaluate the model.

-        :param data: Many data samples.
+        :param data: Data samples.
        :param labels: The ground truth labels for `data`.
        :return: The accuracy for the given test set.
        """
        pass

+    @abstractmethod
+    def log_evaluation_details(self, data, labels, level=logging.INFO) -> float:
+        """
+        Log some evaluation details.
+
+        :param data: Data samples.
+        :param labels: The ground truth labels for `data`.
+        :param level: The level at which to log.
+        :return: The accuracy for the given test set.
+        """
+        pass
+
    @abstractmethod
    def init_model(self, training_data, labels):
        """
@@ -1,3 +1,4 @@
+import logging
import os
import time
from dataclasses import dataclass
@@ -7,6 +8,7 @@ from typing import Any
import joblib
import numpy as np
from injector import inject, Module, provider, ClassAssistedBuilder
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from decai.simulation.contract.classification.classifier import Classifier
@@ -33,6 +35,23 @@ class SciKitClassifier(Classifier):
        self._logger.debug("Evaluating.")
        return self._model.score(data, labels)

+    def log_evaluation_details(self, data, labels, level=logging.INFO) -> float:
+        assert self._model is not None, "The model has not been initialized yet."
+        assert isinstance(data, np.ndarray), "The data must be an array."
+        assert isinstance(labels, np.ndarray), "The labels must be an array."
+        self._logger.debug("Evaluating.")
+        predicted_labels = self._model.predict(data)
+        result = accuracy_score(labels, predicted_labels)
+        if self._logger.isEnabledFor(level):
+            m = confusion_matrix(labels, predicted_labels)
+            report = classification_report(labels, predicted_labels)
+            self._logger.log(level,
+                             "Confusion matrix:\n%s"
+                             "\nReport:\n%s"
+                             "\nAccuracy: %0.2f%%",
+                             m, report, result * 100)
+        return result
+
    def init_model(self, training_data, labels):
        assert self._model is None, "The model has already been initialized."
        self._logger.debug("Initializing model.")
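As a rough standalone illustration (not part of the commit) of what `log_evaluation_details` reports, the same scikit-learn metrics can be computed directly; the toy labels below are made up:

```python
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Made-up ground truth and predictions for two classes (0 = reliable, 1 = unreliable).
labels = np.array([0, 0, 1, 1, 1, 0])
predicted_labels = np.array([0, 1, 1, 1, 0, 0])

accuracy = accuracy_score(labels, predicted_labels)       # 4 of 6 correct -> ~0.67
m = confusion_matrix(labels, predicted_labels)            # 2x2 matrix of counts
report = classification_report(labels, predicted_labels)  # per-class precision/recall/F1
print("Confusion matrix:\n%s\nReport:\n%s\nAccuracy: %0.2f%%" % (m, report, accuracy * 100))
```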
@@ -0,0 +1,266 @@
import itertools
import json
import os
import random
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from logging import Logger
from operator import itemgetter
from typing import Optional, Collection, Tuple

import numpy as np
import pandas as pd
import spacy
from injector import ClassAssistedBuilder, inject, Module, provider, singleton
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.cli import download
from tqdm import tqdm

from .data_loader import DataLoader


class Label(Enum):
    RELIABLE = 0
    UNRELIABLE = 1


@dataclass
class News:
    text: Optional[str]
    label: Label


@inject
@dataclass
class _SignalMediaDataLoader(DataLoader):
    """
    INCOMPLETE BECAUSE MAPPING THE SOURCE NAMES TO DOMAIN NAMES IS TRICKY.
    See https://github.com/aldengolab/fake-news-detection/issues/4

    Following the logic of https://github.com/aldengolab/fake-news-detection.
    Requires the Signal Media dataset from http://research.signalmedia.co/newsir16/signal-dataset.html to be at
    simulation/training_data/news/sample-1M.jsonl
    and https://github.com/OpenSourcesGroup/opensources with sources.json in simulation/training_data/news/
    """
    _logger: Logger
    _media_types = {'News'}

    def find_source_site(self, source_name: str, sources: Collection[str]) -> Optional[str]:
        """
        :param source_name: The name of the source.
        :param sources: Source domain names.
        :return: The source domain name from `sources` or `None` if no mapping can be found.
        """
        # TODO
        result = None
        return result

    def load_data(self, train_size: int = None, test_size: int = None) -> (tuple, tuple):
        data_folder_path = os.path.join(__file__, '../../../../training_data/news')
        signal_data_path = os.path.join(data_folder_path, 'sample-1M.jsonl')
        if not os.path.exists(signal_data_path):
            raise Exception(f"Could not find the Signal Media dataset at \"{signal_data_path}\"."
                            "\nYou must obtain it from http://research.signalmedia.co/newsir16/signal-dataset.html"
                            f" and follow the instructions to obtain it. Then extract it to \"{signal_data_path}\".")

        sources_path = os.path.join(data_folder_path, 'sources.json')
        if not os.path.exists(sources_path):
            raise Exception(f"Could not find the sources dataset at \"{sources_path}\"."
                            "\nYou must obtain it from https://github.com/OpenSourcesGroup/opensources and put"
                            f" sources.json in \"{data_folder_path}\".")

        with open(sources_path) as f:
            loaded_sources = json.load(f)
        sources = dict()
        for source, info in loaded_sources.items():
            problem_types = (info['type'], info['2nd type'], info['3rd type'])
            sources[source] = set(filter(None, problem_types))
        self._logger.info("Found %d sources with labels.", len(sources))

        # Name: website name in `sources`.
        source_mapping = {}
        not_found_flag = -1
        with open(signal_data_path) as f:
            for index, line in tqdm(enumerate(f),
                                    desc="Filtering news articles",
                                    unit_scale=True, mininterval=2, unit=" articles"
                                    ):
                news = json.loads(line)
                news_id = news['id']
                title = news['title']
                text = news['content']
                source = news['source']
                # media-type is either "News" or "Blog"
                media_type = news['media-type']
                published_date = news['published']
                if media_type not in self._media_types:
                    continue
                source_site = source_mapping.get(source)
                if source_site is None:
                    source_site = self.find_source_site(source, sources)
                    if source_site is not None:
                        source_mapping[source] = source_site
                    else:
                        source_mapping[source] = not_found_flag
                        continue
                elif source_site == not_found_flag:
                    continue
                # TODO Use article and set label.

        with open(os.path.join(data_folder_path, 'source_mapping.json')) as f:
            sorted(source_mapping.items(), key=itemgetter(0))

        self._logger.info("Found %d sources in the articles.", len(source_mapping))

        # TODO Set up output.
        (x_train, y_train), (x_test, y_test) = (None, None), (None, None)
        if train_size is not None:
            x_train, y_train = x_train[:train_size], y_train[:train_size]
        if test_size is not None:
            x_test, y_test = x_test[:test_size], y_test[:test_size]

        self._logger.info("Done loading news data.")
        return (x_train, y_train), (x_test, y_test)


@inject
@dataclass
class NewsDataLoader(DataLoader):
    """
    Load data from news sources.

    Requires data from https://www.kaggle.com/c/fake-news/data to be saved to "simulation/training_data/news/fake-news/train.csv".
    """

    _logger: Logger
    _train_split = 0.7

    _replace_entities_enabled = False
    """
    If True, entities will be replaced in the text with the entity's label surrounded by angle brackets: "<LABEL>".
    Accuracy with replacement: 0.9172
    Accuracy without replacement: 0.9173

    Disabled because using spaCy is slow, it would be tricky to use spaCy in JavaScript,
    and it didn't change the evaluation metrics much.
    """

    _entity_types_to_replace = {'PERSON', 'GPE', 'ORG', 'DATE', 'TIME', 'PERCENT',
                                'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'}

    def __post_init__(self):
        spacy_model = 'en_core_web_lg'
        download(spacy_model)
        self._nlp = spacy.load(spacy_model, disable={'tagger', 'parser', 'textcat'})

    def _load_kaggle_data(self, data_folder_path: str) -> Collection[News]:
        """
        Load data from https://www.kaggle.com/c/fake-news/data.
        """
        # Don't use the test data because it has no labels.
        fake_news_data_path = os.path.join(data_folder_path, 'fake-news', 'train.csv')
        if not os.path.exists(fake_news_data_path):
            raise Exception(f"Could not find the Fake News dataset at \"{fake_news_data_path}\"."
                            "\nYou must obtain it from https://www.kaggle.com/c/fake-news/data.")
        data = pd.read_csv(fake_news_data_path, na_values=dict(text=[]), keep_default_na=False)
        result = []
        for row in data.itertuples():
            label = Label.RELIABLE if row.label == 0 else Label.UNRELIABLE
            if len(row.text) > 0:
                result.append(News(row.text, label))

        # Consistent shuffle to aim for a mostly even distribution of labels.
        random.shuffle(result, lambda: 0.618)

        return result

    def _replace_entities(self, doc) -> str:
        # Remove names in the text using spaCy.
        result = doc.text
        for ent in doc.ents[::-1]:
            if ent.label_ in self._entity_types_to_replace:
                result = result[:ent.start_char] + "<" + ent.label_ + ">" + result[ent.end_char:]
        return result

    def _pre_process_text(self, doc) -> str:
        # TODO Remove names of news sources.
        if self._replace_entities_enabled:
            result = self._replace_entities(doc)
        else:
            assert isinstance(doc, str)
            result = doc
        return result

    def _pre_process(self, news_articles: Collection[News], train_size: int, test_size: int) -> \
            Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
        self._logger.info("Getting features for %d articles.", len(news_articles))
        # Only use binary features.
        ngram_range = (2, 2)
        t = TfidfVectorizer(max_features=3000, ngram_range=ngram_range)
        test_start = len(news_articles) - test_size

        x_train = map(lambda news: news.text, itertools.islice(news_articles, train_size))
        x_test = map(lambda news: news.text, itertools.islice(news_articles, test_start, len(news_articles)))
        if self._replace_entities_enabled:
            x_train = self._nlp.pipe(x_train, batch_size=128)
            x_test = self._nlp.pipe(x_test, batch_size=128)

        x_train = map(self._pre_process_text, x_train)
        x_test = map(self._pre_process_text, x_test)

        x_train = t.fit_transform(tqdm(x_train,
                                       desc="Processing training data",
                                       total=train_size,
                                       unit_scale=True, mininterval=2,
                                       unit=" articles"
                                       )).toarray()
        x_test = t.transform(tqdm(x_test,
                                  desc="Processing testing data",
                                  total=test_size,
                                  unit_scale=True, mininterval=2,
                                  unit=" articles"
                                  )).toarray()

        y_train = np.array([news.label.value for news in itertools.islice(news_articles, train_size)], np.int8)
        y_test = np.array([news.label.value for news in itertools.islice(news_articles,
                                                                          test_start, len(news_articles))], np.int8)
        self._logger.debug("Training labels: %s", Counter(y_train))
        self._logger.debug("Test labels: %s", Counter(y_test))
        self._logger.info("Done getting features.")
        return (x_train, y_train), (x_test, y_test)

    def load_data(self, train_size: int = None, test_size: int = None) -> \
            Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
        self._logger.info("Loading news data.")
        data_folder_path = os.path.join(__file__, '../../../../training_data/news')

        data = self._load_kaggle_data(data_folder_path)

        # Separate train and test data.
        if train_size is None:
            if test_size is None:
                train_size = int(self._train_split * len(data))
            else:
                train_size = len(data) - test_size
        if test_size is None:
            test_size = len(data) - train_size
        if train_size + test_size > len(data):
            raise Exception("There is not enough data for the requested sizes."
                            f"\n  data size: {len(data)}"
                            f"\n  train size: {train_size}"
                            f"\n  test size: {test_size}")

        (x_train, y_train), (x_test, y_test) = self._pre_process(data, train_size, test_size)
        self._logger.info("Done loading news data.")
        return (x_train, y_train), (x_test, y_test)


@dataclass
class NewsDataModule(Module):

    @provider
    @singleton
    def provide_data_loader(self, builder: ClassAssistedBuilder[NewsDataLoader]) -> DataLoader:
        return builder.build()
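To make the intent of `NewsDataLoader` concrete, here is a minimal standalone sketch of the same idea: load the Kaggle `train.csv`, keep non-empty articles, extract bigram TF-IDF features capped at 3000 dimensions, and fit a perceptron. It is not part of the commit; the `text` and `label` column names match the Kaggle fake-news data, and the resulting accuracy will not exactly match the 0.9173 baseline used below.

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split

# Assumes the Kaggle fake-news training file; label 0 = reliable, 1 = unreliable.
data = pd.read_csv('training_data/news/fake-news/train.csv', keep_default_na=False)
data = data[data['text'].str.len() > 0]

x_train, x_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.3, shuffle=True, random_state=0)

# Bigram TF-IDF features, capped at 3000 dimensions as in the loader above.
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(2, 2))
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

model = Perceptron()
model.fit(x_train, y_train)
print("Test accuracy: %0.4f" % model.score(x_test, y_test))
```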
@@ -0,0 +1,30 @@
import unittest
from typing import cast

from injector import Injector

from decai.simulation.data.data_loader import DataLoader
from decai.simulation.data.news_data_loader import NewsDataLoader, NewsDataModule
from decai.simulation.logging_module import LoggingModule


class TestNewsDataLoader(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        inj = Injector([
            LoggingModule,
            NewsDataModule,
        ])

        cls.data_loader = inj.get(DataLoader)
        assert isinstance(cls.data_loader, NewsDataLoader)
        cls.data_loader = cast(NewsDataLoader, cls.data_loader)

    @unittest.skip("The dataset does not exist on the CI test machine.")
    def test_load_data(self):
        (x_train, y_train), (x_test, y_test) = self.data_loader.load_data()

    def test_entities(self):
        doc = self.data_loader._nlp("Today, John Smith walked to a store and bought an apple.")
        actual = self.data_loader._replace_entities(doc)
        self.assertEqual("<DATE>, <PERSON> walked to a store and bought an apple.", actual)
@@ -105,7 +105,7 @@ class TitanicDataLoader(DataLoader):
        data_folder_path = os.path.join(__file__, '../../../../training_data/titanic')
        if not os.path.exists(data_folder_path):
            # TODO Attempt to download the data.
-            raise Exception(f"Could not find Titanic dataset at {data_folder_path}"
+            raise Exception(f"Could not find Titanic dataset at \"{data_folder_path}\"."
                            "\nYou must download it from https://www.kaggle.com/c/titanic/data.")

        x_train = pd.read_csv(os.path.join(data_folder_path, 'train.csv'))
@@ -221,7 +221,7 @@ class Simulator(object):
        self._logger.debug("Remaining training data evaluation: %s", s)

        self._logger.info("Evaluating initial model.")
-        accuracy = self._decai.model.evaluate(x_test, y_test)
+        accuracy = self._decai.model.log_evaluation_details(x_test, y_test)
        self._logger.info("Initial test set accuracy: %0.2f%%", accuracy * 100)
        t = self._time()
        doc.add_next_tick_callback(
@@ -239,6 +239,7 @@ class Simulator(object):
        next_data_index = 0
        next_accuracy_plot_time = 1E4
        desc = "Processing agent requests"
        current_time = 0
        with tqdm(desc=desc,
                  unit_scale=True, mininterval=2, unit=" requests",
                  total=len(x_remaining),
@@ -436,9 +437,9 @@ class Simulator(object):

        self._logger.info("Done issuing rewards.")

-        accuracy = self._decai.model.evaluate(x_test, y_test)
+        accuracy = self._decai.model.log_evaluation_details(x_test, y_test)
        doc.add_next_tick_callback(
-            partial(plot_accuracy_cb, t=current_time, a=accuracy))
+            partial(plot_accuracy_cb, t=current_time + 100, a=accuracy))

        with open(save_path, 'w') as f:
            json.dump(save_data, f, separators=(',', ':'))
@@ -0,0 +1,78 @@
import os
import sys
from typing import Optional

from injector import Injector

from decai.simulation.contract.classification.perceptron import PerceptronModule
from decai.simulation.contract.collab_trainer import DefaultCollaborativeTrainerModule
from decai.simulation.contract.incentive.stakeable import StakeableImModule
from decai.simulation.data.news_data_loader import NewsDataModule
from decai.simulation.logging_module import LoggingModule
from decai.simulation.simulate import Agent, Simulator

# For `bokeh serve`.
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

train_size: Optional[int] = None
test_size: Optional[int] = None
if train_size is None:
    init_train_data_portion = 0.08
else:
    init_train_data_portion = 100 / train_size


def main():
    # Set up the agents that will act in the simulation.
    agents = [
        # Good
        Agent(address="Good",
              start_balance=10_000,
              mean_deposit=50,
              stdev_deposit=10,
              mean_update_wait_s=10 * 60,
              prob_mistake=0.0001,
              ),
        # Malicious: a determined agent with the goal of disrupting others.
        Agent(address="Bad",
              start_balance=10_000,
              mean_deposit=100,
              stdev_deposit=3,
              mean_update_wait_s=1 * 60 * 60,
              good=False,
              ),
        # One that just calls the model and pays to use the model.
        Agent(address="Caller",
              start_balance=30_000,
              mean_deposit=0,
              stdev_deposit=0,
              mean_update_wait_s=2 * 60 * 60,
              calls_model=True,
              pay_to_call=50
              ),
    ]
    # No caller (assume free to call).
    agents = agents[:-1]

    # Set up the data, model, and incentive mechanism.
    inj = Injector([
        DefaultCollaborativeTrainerModule,
        NewsDataModule(),
        LoggingModule,
        PerceptronModule,
        StakeableImModule,
    ])
    s = inj.get(Simulator)

    # Start the simulation.
    s.simulate(agents,
               baseline_accuracy=0.9173,
               init_train_data_portion=init_train_data_portion,
               train_size=train_size,
               test_size=test_size,
               )


# Run with `bokeh serve PATH`.
if __name__.startswith('bk_script_'):
    main()
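A hypothetical invocation of the new runner: the exact file path is an assumption, since the script above only says it should be launched via `bokeh serve PATH` from the activated environment.

```bash
conda activate decai-simulation
# Replace the path with wherever this runner script lives in the repository.
bokeh serve decai/simulation/simulate_fake_news_perceptron.py
```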
@@ -7,19 +7,13 @@ install_requires = [
    'expiringdict>=1.1.4',
    'injector>=0.16.2',
    'joblib>=0.13.2',

-    # Use a specific commit because the latest released version has a bug:
-    # https://github.com/keras-team/keras/issues/12729
-    # Fix: https://github.com/keras-team/keras/pull/12714
-    'keras @ git+https://github.com/keras-team/keras.git@47e1b18c0b7e3ddeef4e9fcded409a55d0479a4f',
-    # Used to be: 'Keras>=2.1',
-    # We need 'Keras>2.2.4', but it doesn't exist yet.
+    'keras>=2.3',
    'numpy',

    # Required for saving plots.
    'selenium>=3.141.0',
    'scikit-multiflow>=0.3.0',
+    'spacy>=2.2',

    'tqdm>=4.19',
]