Merge pull request #3 from microsoft/chjinche/add_custom_parser_example

Add FreeForm2Parser as customized parser example
This commit is contained in:
chjinche 2021-11-05 23:04:38 +08:00 коммит произвёл GitHub
Родитель b52111e66a ee6a4783d1
Коммит 6de49eff7e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
14 изменённых файлов: 10026 добавлений и 41 удалений

5
.gitignore поставляемый
Просмотреть файл

@ -1 +1,4 @@
build
build
*egg-info
dist
*.so

21
CMakeLists.txt Normal file
Просмотреть файл

@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.15.0 FATAL_ERROR)
project(custom_transform)

# Name of the shared library target that holds the customized parser example.
set(LIB_NAME "_custom_parser")

# Sub-projects providing the `_lightgbm` and `_transform` targets linked below.
add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/external_libs/LightGBM")
add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/src/")

# Only real source files belong here; header search paths are configured with
# target_include_directories() below, not listed as sources.
add_library(${LIB_NAME} SHARED
  "${CMAKE_CURRENT_SOURCE_DIR}/examples/freeform2_parser.cpp"
)

# Emit the library into the project source root (same location the legacy
# LIBRARY_OUTPUT_PATH variable selected), scoped to this target only.
set_target_properties(${LIB_NAME} PROPERTIES
  LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}"
)

target_include_directories(${LIB_NAME}
  PUBLIC
    "${CMAKE_CURRENT_SOURCE_DIR}"
    "${CMAKE_CURRENT_SOURCE_DIR}/external_libs/LightGBM/include"
)

# --no-as-needed keeps both libraries recorded as dependencies of the parser
# library; the start/end group lets the linker resolve references between them
# regardless of the order they are listed in.
target_link_libraries(${LIB_NAME}
  PRIVATE
    -Wl,--no-as-needed
    -Wl,--start-group
    _lightgbm
    _transform
    -Wl,--end-group
)

Просмотреть файл

@ -0,0 +1,49 @@
#include <LightGBM/dataset.h>
#include <LightGBM/utils/log.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include "TransformProcessor.h"
using namespace std;
namespace LightGBM {
class FreeForm2Parser: public Parser {
public:
FreeForm2Parser(std::string config_str) {
string label_key = "labelId:";
string expr_key = "transform:\n";
string header_key = "header:\n";
size_t start_pos = config_str.find(label_key);
config_str.erase(0, start_pos);
size_t end_pos = config_str.find("\n");
string label_line = config_str.substr(label_key.size(), end_pos);
int label_idx = std::stod(label_line);
start_pos = config_str.find(expr_key);
config_str.erase(0, start_pos);
end_pos = config_str.find("end of transform");
string transform_str = config_str.substr(expr_key.size(), end_pos);
start_pos = config_str.find(header_key);
config_str.erase(0, start_pos);
end_pos = config_str.find("end of header");
string header_str = config_str.substr(header_key.size(), end_pos);
Log::Info("Initializing transform processor.");
transform_.reset(new TransformProcessor(transform_str, header_str, label_idx));
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
vector<string> out_feature_strs;
out_feature_strs.clear();
transform_->Parse(str, &out_feature_strs, out_label, "\t");
transform_->Apply(&out_feature_strs, out_features);
}
inline int NumFeatures() const override {return transform_->GetFeatureCount();}
private:
std::unique_ptr<TransformProcessor> transform_;
};
Parser* CreateObject(std::string config_str) { return new FreeForm2Parser(config_str);}
ParserReflector reflector("FreeForm2Parser", CreateObject);
}

Просмотреть файл

@ -0,0 +1,7 @@
import ctypes
from pathlib import Path
# File name of the shared library that contains the customized parser.
CUSTOM_PARSER_LIB_NAME = 'lib_custom_parser.so'

# Load every precompiled shared library that sits next to this file, in the
# listed order; the customized parser library is loaded last.
for p in ('lib_transform.so', 'lib_lightgbm.so', CUSTOM_PARSER_LIB_NAME):
    print(p)
    ctypes.cdll.LoadLibrary(
        str(Path(__file__).resolve().parent.joinpath(p))
    )

Просмотреть файл

@ -0,0 +1,17 @@
# Run `sh ./scripts/publish_python_package.sh` from the repo root dir.
# Builds the shared libs, packs them into a LightGBM wheel and uploads it.
lgb_python_pkg_dir="./external_libs/LightGBM/python-package"
lgb_module_dir="${lgb_python_pkg_dir}/lightgbm"

# Put the original `basic.py` back and delete the copied libs and temp files.
# Safe to call more than once: the backup is only restored while it exists.
revert_changes() {
    if [ -f raw ]; then
        mv raw "${lgb_module_dir}/basic.py" || return 1
    fi
    rm -f tmp "${lgb_module_dir}"/*.so
}

# compile transformation, lightgbm, and customized parser libs.
# The build directory is created on demand so a clean checkout works too; the
# subshell keeps the working directory of this script unchanged.
mkdir -p build || exit 1
(cd build && cmake ../ && make -j4) || exit 1

# From here on the python package gets modified, so make sure it is restored
# even when a later step fails.
trap revert_changes EXIT

# copy all shared libs to lightgbm python package directory.
cp ./lib_custom_parser.so "${lgb_module_dir}" && \
cp ./src/lib_transform.so "${lgb_module_dir}" && \
cp ./external_libs/LightGBM/lib_lightgbm.so "${lgb_module_dir}" || exit 1

# modify `basic.py` to load all libs first, or cannot find them when calling python interfaces.
# `raw` keeps the untouched original, `tmp` holds the loader script followed by the original.
cp "${lgb_module_dir}/basic.py" raw && \
cat ./scripts/load_precompiled_libs.py "${lgb_module_dir}/basic.py" > tmp && \
cp tmp "${lgb_module_dir}/basic.py" || exit 1

# pack wheel package.
(cd "${lgb_python_pkg_dir}" && rm -rf dist/ && python setup.py bdist_wheel --precompile) || exit 1

# revert changes
revert_changes || exit 1
trap - EXIT

# upload package to your pypi, use testpypi as an example.
twine upload --repository testpypi "${lgb_python_pkg_dir}"/dist/* || exit 1

Просмотреть файл

@ -48,41 +48,6 @@ set(LLVM_LIB
LLVMX86Utils
)
set(BOOST_LIB
boost_atomic
boost_chrono
boost_context
boost_coroutine
boost_date_time
boost_exception
boost_filesystem
boost_graph
boost_graph_parallel
boost_iostreams
boost_locale
boost_log
boost_log_setup
boost_math_c99
boost_math_c99f
boost_math_c99l
boost_math_tr1
boost_math_tr1f
boost_math_tr1l
boost_mpi
boost_prg_exec_monitor
boost_program_options
boost_random
boost_regex
boost_serialization
boost_system
boost_test_exec_monitor
boost_thread
boost_timer
boost_unit_test_framework
boost_wave
boost_wserialization
)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if(USE_DEBUG)
SET(CMAKE_BUILD_TYPE "Debug")

Просмотреть файл

@ -45,7 +45,7 @@ target_link_libraries(${PROJECT_NAME}
DRFreeFormTransformLibrary
DRFreeFormSExpressionLibrary
DRFreeFormLibrary
${BOOST_LIB}
${Boost_LIBRARIES}
${LLVM_LIB}
-Wl,--end-group
)

Просмотреть файл

@ -46,7 +46,7 @@ target_link_libraries(${PROJECT_NAME}
DRFreeFormSExpressionLibrary
DRFreeFormLibrary
NeuralTreeEvaluatorLibrary
${BOOST_LIB}
${Boost_LIBRARIES}
${LLVM_LIB}
-Wl,--end-group
)

Просмотреть файл

@ -2,7 +2,9 @@
set(PROJECT_NAME TransformProcessor)
Project(${PROJECT_NAME})
project(${PROJECT_NAME})
set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/../../)
add_library(_transform SHARED
IniFileParserInterface.h
@ -36,7 +38,7 @@ target_link_libraries(_transform
DRFreeFormSExpressionLibrary
DRFreeFormLibrary
NeuralTreeEvaluatorLibrary
${BOOST_LIB}
${Boost_LIBRARIES}
${LLVM_LIB}
-Wl,--end-group
)

Просмотреть файл

@ -39,7 +39,7 @@ target_link_libraries(${PROJECT_NAME}
DRFreeFormLibrary
NeuralTreeEvaluatorLibrary
_transform
${BOOST_LIB}
${Boost_LIBRARIES}
${LLVM_LIB}
-Wl,--end-group
)

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,33 @@
className:FreeForm2Parser
labelId:10
transform:
[Input:1]
Line1=(+ feature_1 feature_2)
Transform=FreeForm2
Slope=1
Intercept=0
[Input:2]
Transform=FreeForm2
Line1=(* feature_1 feature_3)
[Input:3]
Transform=FreeForm2
Line1=(max feature_6 feature_7)
[Input:4]
Transform=Linear
Name=feature_8
Intercept=0
Slope=1
[Input:5]
Transform=Linear
Name=feature_9
Intercept=0
Slope=1
end of transform
header:
feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 feature_6 feature_7 feature_8 feature_9 labels
end of header

Просмотреть файл

@ -0,0 +1,100 @@
12 12 12 2 18 7 16 5 11 14 1
3 12 10 11 10 8 5 0 7 8 0
9 3 18 4 8 18 12 14 3 4 1
15 15 14 17 0 10 2 5 2 8 0
18 16 9 9 15 16 9 10 15 19 0
5 0 14 1 15 16 8 2 2 18 1
10 9 1 13 19 7 7 15 1 4 1
19 3 4 16 19 8 3 8 17 15 1
5 16 14 13 14 14 17 3 3 3 1
7 11 10 15 19 3 5 11 6 3 1
5 12 14 12 9 11 19 18 17 2 1
6 15 9 11 10 15 17 0 6 6 1
10 8 16 13 9 0 10 7 19 19 0
16 4 19 18 13 0 10 7 14 4 0
19 17 12 5 12 5 18 15 12 19 0
2 18 1 18 18 8 8 7 8 0 1
1 17 11 1 8 12 0 11 5 6 1
4 1 11 10 15 19 15 2 5 16 0
15 14 10 5 6 15 7 16 3 4 0
6 6 11 8 15 4 16 16 8 12 0
10 15 11 9 12 6 5 9 19 0 1
4 3 3 8 13 18 5 12 19 8 0
11 13 7 2 18 17 5 17 3 10 1
19 7 9 0 1 9 11 7 4 1 0
8 19 17 18 17 5 3 6 7 6 0
16 4 19 13 3 0 15 6 16 2 1
9 1 0 19 7 9 8 19 10 16 0
9 5 4 12 7 5 8 11 16 1 1
15 14 18 0 13 18 2 10 10 14 1
16 1 5 3 3 7 11 8 14 4 1
10 19 0 4 6 11 12 10 9 19 0
18 6 18 6 6 10 4 13 17 12 0
15 2 17 11 6 8 10 3 0 12 1
2 14 3 11 5 19 17 9 9 4 1
16 19 7 9 10 13 15 17 0 0 1
4 6 17 17 14 5 2 8 12 7 1
14 7 11 5 0 13 1 0 17 16 0
17 13 0 11 5 0 16 1 19 17 1
9 13 3 17 1 1 2 15 2 18 0
12 17 3 10 4 9 8 4 7 5 0
19 8 16 6 9 18 19 12 17 0 1
1 14 15 8 7 9 1 19 8 12 1
16 1 16 10 17 12 6 4 6 8 1
7 8 16 10 15 15 8 14 14 0 0
6 1 10 14 14 10 15 0 8 10 1
3 16 12 16 17 7 8 19 17 19 1
15 0 0 12 7 0 13 2 17 16 0
5 8 12 12 4 10 0 8 12 7 1
10 17 14 19 13 2 8 17 17 8 1
11 6 11 16 10 12 7 19 14 13 0
18 10 7 3 11 17 4 12 13 8 1
4 3 10 8 14 10 10 10 10 12 1
3 2 14 1 7 19 8 6 14 3 1
1 8 2 11 17 8 0 0 7 18 0
0 13 8 11 4 19 16 13 5 0 0
19 4 19 14 19 17 19 14 12 4 0
9 15 16 5 8 15 18 0 9 18 1
5 10 0 14 19 18 1 4 6 9 0
1 14 19 11 3 6 16 9 15 6 1
12 7 13 19 12 15 4 7 15 14 0
18 18 9 6 4 8 6 1 9 4 0
11 14 16 9 14 9 17 9 7 12 1
1 2 7 10 13 15 4 9 7 17 0
7 11 10 17 1 18 4 12 7 16 0
4 2 14 0 11 17 6 0 14 13 0
16 12 14 6 5 18 9 15 19 10 1
15 15 18 3 7 5 2 14 13 12 0
9 12 15 1 18 13 12 10 19 2 1
17 16 19 6 15 5 6 3 5 1 1
8 2 5 4 16 13 3 6 8 9 1
4 9 19 7 0 7 15 13 6 8 0
12 7 9 4 9 1 1 16 18 10 1
18 0 15 1 19 16 2 17 3 14 0
5 16 18 11 2 11 9 19 5 14 0
9 10 8 19 16 9 8 2 3 2 0
5 12 15 1 16 10 0 18 9 12 1
11 12 17 3 13 18 16 5 4 8 1
14 1 2 13 5 15 8 3 14 0 0
11 14 13 7 6 17 16 0 7 0 1
18 9 16 8 10 11 1 18 11 14 1
10 18 9 9 19 18 9 11 2 14 1
4 15 3 1 13 17 4 9 0 7 1
2 12 17 7 13 12 8 5 1 5 0
10 17 8 5 14 6 4 18 17 0 0
19 9 14 17 6 18 5 12 1 3 1
11 10 12 19 10 8 18 3 1 14 1
4 6 15 5 12 13 17 7 7 10 1
18 6 10 5 12 0 11 4 11 18 1
4 1 18 13 18 2 14 5 16 18 0
6 3 16 1 14 2 12 2 3 9 1
11 12 11 0 0 18 6 19 6 14 0
16 9 12 4 7 15 6 8 16 14 1
10 2 15 1 11 5 2 14 16 19 0
0 5 14 10 9 15 13 5 11 12 1
3 18 16 16 4 14 13 10 7 14 0
13 19 13 0 0 3 17 8 4 16 0
1 19 6 16 1 13 10 3 12 10 0
8 8 14 1 2 14 2 6 19 10 1
15 19 10 17 11 11 17 16 17 12 1
5 16 12 16 8 18 1 1 11 5 0
1 12 12 12 2 18 7 16 5 11 14 1
2 3 12 10 11 10 8 5 0 7 8 0
3 9 3 18 4 8 18 12 14 3 4 1
4 15 15 14 17 0 10 2 5 2 8 0
5 18 16 9 9 15 16 9 10 15 19 0
6 5 0 14 1 15 16 8 2 2 18 1
7 10 9 1 13 19 7 7 15 1 4 1
8 19 3 4 16 19 8 3 8 17 15 1
9 5 16 14 13 14 14 17 3 3 3 1
10 7 11 10 15 19 3 5 11 6 3 1
11 5 12 14 12 9 11 19 18 17 2 1
12 6 15 9 11 10 15 17 0 6 6 1
13 10 8 16 13 9 0 10 7 19 19 0
14 16 4 19 18 13 0 10 7 14 4 0
15 19 17 12 5 12 5 18 15 12 19 0
16 2 18 1 18 18 8 8 7 8 0 1
17 1 17 11 1 8 12 0 11 5 6 1
18 4 1 11 10 15 19 15 2 5 16 0
19 15 14 10 5 6 15 7 16 3 4 0
20 6 6 11 8 15 4 16 16 8 12 0
21 10 15 11 9 12 6 5 9 19 0 1
22 4 3 3 8 13 18 5 12 19 8 0
23 11 13 7 2 18 17 5 17 3 10 1
24 19 7 9 0 1 9 11 7 4 1 0
25 8 19 17 18 17 5 3 6 7 6 0
26 16 4 19 13 3 0 15 6 16 2 1
27 9 1 0 19 7 9 8 19 10 16 0
28 9 5 4 12 7 5 8 11 16 1 1
29 15 14 18 0 13 18 2 10 10 14 1
30 16 1 5 3 3 7 11 8 14 4 1
31 10 19 0 4 6 11 12 10 9 19 0
32 18 6 18 6 6 10 4 13 17 12 0
33 15 2 17 11 6 8 10 3 0 12 1
34 2 14 3 11 5 19 17 9 9 4 1
35 16 19 7 9 10 13 15 17 0 0 1
36 4 6 17 17 14 5 2 8 12 7 1
37 14 7 11 5 0 13 1 0 17 16 0
38 17 13 0 11 5 0 16 1 19 17 1
39 9 13 3 17 1 1 2 15 2 18 0
40 12 17 3 10 4 9 8 4 7 5 0
41 19 8 16 6 9 18 19 12 17 0 1
42 1 14 15 8 7 9 1 19 8 12 1
43 16 1 16 10 17 12 6 4 6 8 1
44 7 8 16 10 15 15 8 14 14 0 0
45 6 1 10 14 14 10 15 0 8 10 1
46 3 16 12 16 17 7 8 19 17 19 1
47 15 0 0 12 7 0 13 2 17 16 0
48 5 8 12 12 4 10 0 8 12 7 1
49 10 17 14 19 13 2 8 17 17 8 1
50 11 6 11 16 10 12 7 19 14 13 0
51 18 10 7 3 11 17 4 12 13 8 1
52 4 3 10 8 14 10 10 10 10 12 1
53 3 2 14 1 7 19 8 6 14 3 1
54 1 8 2 11 17 8 0 0 7 18 0
55 0 13 8 11 4 19 16 13 5 0 0
56 19 4 19 14 19 17 19 14 12 4 0
57 9 15 16 5 8 15 18 0 9 18 1
58 5 10 0 14 19 18 1 4 6 9 0
59 1 14 19 11 3 6 16 9 15 6 1
60 12 7 13 19 12 15 4 7 15 14 0
61 18 18 9 6 4 8 6 1 9 4 0
62 11 14 16 9 14 9 17 9 7 12 1
63 1 2 7 10 13 15 4 9 7 17 0
64 7 11 10 17 1 18 4 12 7 16 0
65 4 2 14 0 11 17 6 0 14 13 0
66 16 12 14 6 5 18 9 15 19 10 1
67 15 15 18 3 7 5 2 14 13 12 0
68 9 12 15 1 18 13 12 10 19 2 1
69 17 16 19 6 15 5 6 3 5 1 1
70 8 2 5 4 16 13 3 6 8 9 1
71 4 9 19 7 0 7 15 13 6 8 0
72 12 7 9 4 9 1 1 16 18 10 1
73 18 0 15 1 19 16 2 17 3 14 0
74 5 16 18 11 2 11 9 19 5 14 0
75 9 10 8 19 16 9 8 2 3 2 0
76 5 12 15 1 16 10 0 18 9 12 1
77 11 12 17 3 13 18 16 5 4 8 1
78 14 1 2 13 5 15 8 3 14 0 0
79 11 14 13 7 6 17 16 0 7 0 1
80 18 9 16 8 10 11 1 18 11 14 1
81 10 18 9 9 19 18 9 11 2 14 1
82 4 15 3 1 13 17 4 9 0 7 1
83 2 12 17 7 13 12 8 5 1 5 0
84 10 17 8 5 14 6 4 18 17 0 0
85 19 9 14 17 6 18 5 12 1 3 1
86 11 10 12 19 10 8 18 3 1 14 1
87 4 6 15 5 12 13 17 7 7 10 1
88 18 6 10 5 12 0 11 4 11 18 1
89 4 1 18 13 18 2 14 5 16 18 0
90 6 3 16 1 14 2 12 2 3 9 1
91 11 12 11 0 0 18 6 19 6 14 0
92 16 9 12 4 7 15 6 8 16 14 1
93 10 2 15 1 11 5 2 14 16 19 0
94 0 5 14 10 9 15 13 5 11 12 1
95 3 18 16 16 4 14 13 10 7 14 0
96 13 19 13 0 0 3 17 8 4 16 0
97 1 19 6 16 1 13 10 3 12 10 0
98 8 8 14 1 2 14 2 6 19 10 1
99 15 19 10 17 11 11 17 16 17 12 1
100 5 16 12 16 8 18 1 1 11 5 0