This commit is contained in:
Gani Nazirov 2018-10-23 14:48:52 -07:00
Parent 9f188d8438
Commit 7d659af568
6 changed files with 51 additions and 46 deletions

1
.gitignore vendored
View File

@ -347,3 +347,4 @@ _doc_report.txt
data.csv
data.txt
/build/TestCoverageReport

View File

@ -299,9 +299,13 @@ set TestsPath1=%PackagePath%\tests
set TestsPath2=%__currentScriptDir%src\python\tests
set ReportPath=%__currentScriptDir%build\TestCoverageReport
call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
if errorlevel 1 (
goto :Exit_Error
)
call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath2%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
goto :Exit_Success
if errorlevel 1 (
goto :Exit_Error
)
:Exit_Success
endlocal

View File

@ -9,9 +9,9 @@ import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import LightGbmClassifier
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
def transform_data(data=None, datatype=None):
@ -34,7 +34,7 @@ def train_data_type_single(
"Talk about second",
"Thrid one",
"Final example."]
model = NGramFeaturizer(word_feature_extractor=n_gram())
model = NGramFeaturizer()
data_with_new_type = transform_data(data, fit_X_type)
model.fit(data_with_new_type)
test_data_with_new_type = transform_data(data, predict_X_type)
@ -49,7 +49,7 @@ def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
"Final example."]
label = [1, 0, 1, 1]
model = Pipeline([
NGramFeaturizer(word_feature_extractor=n_gram()),
NGramFeaturizer(),
LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
])
data_with_new_type = transform_data(data, fit_X_type)
@ -66,127 +66,127 @@ class TestTextDataType(unittest.TestCase):
def test_check_text_datatype_single_list_list_series(self):
result = train_data_type_single("list", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_series_list_series(self):
result = train_data_type_single("series", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_series_list_list(self):
result = train_data_type_single("series", "list", "list")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_array_list_series(self):
result = train_data_type_single("array", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_series_array_dataframe(self):
result = train_data_type_single("series", "array", "dataframe")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_array_series_series(self):
result = train_data_type_single("array", "series", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_dataframe_list_series(self):
result = train_data_type_single("dataframe", "list", "series")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_series_series_dataframe(self):
result = train_data_type_single("series", "series", "dataframe")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_single_dataframe_series_list(self):
result = train_data_type_single("dataframe", "series", "list")
assert len(result) == 4
assert len(result.columns) == 11
assert len(result.columns) == 66
assert all([col.startswith('F0') for col in result.columns])
def test_check_text_datatype_ppl_series_list_array(self):
result, scores, metrics = train_data_type_ppl(
"series", "list", "array")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_list_series_dataframe(self):
result, scores, metrics = train_data_type_ppl(
"list", "series", "dataframe")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_list_list_series(self):
result, scores, metrics = train_data_type_ppl("list", "list", "series")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_array_series_array(self):
result, scores, metrics = train_data_type_ppl(
"array", "series", "array")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_series_array_dataframe(self):
result, scores, metrics = train_data_type_ppl(
"series", "array", "dataframe")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_array_series_list(self):
result, scores, metrics = train_data_type_ppl(
"array", "series", "list")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_dataframe_list_series(self):
result, scores, metrics = train_data_type_ppl(
"dataframe", "list", "series")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_series_series_dataframe(self):
result, scores, metrics = train_data_type_ppl(
"series", "series", "dataframe")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
def test_check_text_datatype_ppl_dataframe_series_series(self):
result, scores, metrics = train_data_type_ppl(
"dataframe", "series", "series")
assert len(result) == 4
assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
if __name__ == '__main__':

View File

@ -38,7 +38,7 @@ class TestNGramFeaturizer(unittest.TestCase):
X_train = texttransform.fit_transform(X_train[:100])
sum = X_train.iloc[:].sum().sum()
print(sum)
assert_equal(sum, 4594, "sum of all features is incorrect!")
assert_equal(sum, 30513, "sum of all features is incorrect!")
if __name__ == '__main__':

View File

@ -91,7 +91,7 @@ class TestNGramFeaturizer(unittest.TestCase):
textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
X = textt.fit_transform(X)
assert X.shape == (25, 21)
assert X.shape == (25, 116)
mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
X_test = textt.transform(test_reviews)
@ -180,7 +180,7 @@ class TestNGramFeaturizer(unittest.TestCase):
'outg': ['review']}
X = textt.fit_transform(X)
assert X.shape == (25, 22)
assert X.shape == (25, 117)
# columns ordering changed between 0.22 and 0.23
assert 'review' in (X.columns[0], X.columns[-1])
X = X.drop('review', axis=1)
@ -204,7 +204,7 @@ class TestNGramFeaturizer(unittest.TestCase):
columns={'features': ['id', 'education']})
features = xf.fit_transform(data)
assert features.shape == (248, 259)
assert features.shape == (248, 652)
def test_ngramfeaturizer_multi(self):

View File

@ -146,4 +146,4 @@ class TestSyntaxOneHotVectorizer(unittest.TestCase):
ng4 = NGramFeaturizer(word_feature_extractor=n_gram()) << {
'out1': ['education1', 'education2']}
output4 = ng4.fit_transform(X)
assert output4.shape == (5, 7)
assert output4.shape == (5, 13)