sep_string()/wsep_string() are replaced by a SplitString() template, which now accepts multiple spaces (or other separator characters) in a row

2016-02-15 12:41:37 -08:00 · 2016-02-15 12:41:37 -08:00 · 828f4c28c0
--- a/Source/Common/Include/fileutil.h
+++ b/Source/Common/Include/fileutil.h
@ -600,8 +600,8 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
    return lines;
 }
 std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
-};
-};
+
+}}

 #ifdef _WIN32
 // ----------------------------------------------------------------------------
@ -863,37 +863,39 @@ static inline bool relpath(const wchar_t* path)
    // ... TODO: handle long NT paths
    return true; // all others
 }
-template <class CHAR>
-static inline bool relpath(const std::basic_string<CHAR>& s)
+template <class Char>
+static inline bool relpath(const std::basic_string<Char>& s)
 {
    return relpath(s.c_str());
 }

 // trim from start
-template<class STRING>
-static inline STRING& ltrim(STRING& s)
+template<class String>
+static inline String& ltrim(String& s)
 {
-    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](typename STRING::value_type c){ return !iscspace(c); }));
+    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](typename String::value_type c){ return !iscspace(c); }));
    return s;
 }

 // trim from end
-template<class STRING>
-static inline STRING& rtrim(STRING& s)
+template<class String>
+static inline String& rtrim(String& s)
 {
-    s.erase(std::find_if(s.rbegin(), s.rend(), [](typename STRING::value_type c){ return !iscspace(c); }).base(), s.end());
+    s.erase(std::find_if(s.rbegin(), s.rend(), [](typename String::value_type c){ return !iscspace(c); }).base(), s.end());
    return s;
 }

 // trim from both ends
-template<class STRING>
-static inline STRING& trim(STRING& s)
+template<class String>
+static inline String& trim(String& s)
 {
    return ltrim(rtrim(s));
 }

-std::vector<std::string> sep_string(const std::string& str, const std::string& sep);
-std::vector<std::wstring> wsep_string(const std::wstring& str, const std::wstring& sep); // TODO: overload sep_string with wstring type, no need for different name
+template<class String>
+std::vector<String> SplitString(const String& str, const String& sep);
+template<class String, class Char>
+std::vector<String> SplitString(const String& str, const Char* sep) { return SplitString(str, String(sep)); }

 std::wstring s2ws(const std::string& str);

--- a/Source/Common/fileutil.cpp
+++ b/Source/Common/fileutil.cpp
@ -1903,54 +1903,35 @@ bool msra::files::fuptodate(const wstring& target, const wstring& input, bool in
    return targettime >= inputtime; // note: uses an overload for WIN32 FILETIME (in Linux, FILETIME=time_t=size_t)
 }

-/// separate string by separator
-vector<string> sep_string(const string& istr, const string& sep)
-{
-    string str = istr;
-    str = trim(str);
-    vector<string> vstr;
-    string csub;
-    size_t ifound = 0;
-    size_t ifoundlast = ifound;
-    ifound = str.find(sep, ifound);
-    while (ifound != std::string::npos)
-    {
-        csub = str.substr(ifoundlast, ifound - ifoundlast);
-        vstr.push_back(trim(csub));
-
-        ifoundlast = ifound + 1;
-        ifound = str.find(sep, ifoundlast);
-    }
-    csub = str.substr(ifoundlast, str.length() - ifoundlast);
-    vstr.push_back(trim(csub));
-
-    return vstr;
-}
-
 // separate string by separator
-// TODO: unify with above
-vector<wstring> wsep_string(const wstring& istr, const wstring& sep)
+template<class String>
+vector<String> SplitString(const String& str, const String& sep)
 {
-    wstring str = istr;
-    str = trim(str);
-    vector<wstring> vstr;
-    wstring csub;
+    vector<String> vstr;
+    String csub;
    size_t ifound = 0;
    size_t ifoundlast = ifound;
-    ifound = str.find(sep, ifound);
-    while (ifound != std::wstring::npos)
+    ifound = str.find_first_of(sep, ifound);
+    while (ifound != String::npos)
    {
        csub = str.substr(ifoundlast, ifound - ifoundlast);
-        vstr.push_back(trim(csub));
+        if (!csub.empty())
+            vstr.push_back(csub);

        ifoundlast = ifound + 1;
-        ifound = str.find(sep, ifoundlast);
+        ifound = str.find_first_of(sep, ifoundlast);
    }
-    csub = str.substr(ifoundlast, str.length() - ifoundlast);
-    vstr.push_back(trim(csub));
+    ifound = str.length();
+    csub = str.substr(ifoundlast, ifound - ifoundlast);
+    if (!csub.empty())
+        vstr.push_back(csub);

    return vstr;
 }
+
+template vector<string>  SplitString(const  string& istr, const  string& sep);
+template vector<wstring> SplitString(const wstring& istr, const wstring& sep);
+
 static inline std::string wcstombs(const std::wstring& p) // output: MBCS
 {
    size_t len = p.length();
--- a/Source/Readers/LMSequenceReader/SequenceParser.h
+++ b/Source/Readers/LMSequenceReader/SequenceParser.h
@ -585,7 +585,7 @@ public:
        while (labels->size() - orgRecordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr)
        {
            ch.assign(ch2);
-            std::vector<string> vstr = sep_string(ch, " ");
+            std::vector<string> vstr = SplitString(ch, " \n\r\t");
            if (vstr.size() < 3)    // TODO: Document this special condition. Why should we not process empty sequences like <s> </s>?
                continue;

--- a/Source/Readers/LUSequenceReader/LUSequenceParser.cpp
+++ b/Source/Readers/LUSequenceReader/LUSequenceParser.cpp
@ -89,7 +89,7 @@ long BatchLUSequenceParser<NumType, LabelType>::Parse(size_t recordsRequested, s
        // got a token
        tokenCount++;

-        vstr = wsep_string(ch, L" ");
+        vstr = SplitString(ch, L" \n\r\t");
        if (vstr.size() < 2)
            continue;

--- a/Source/Readers/LUSequenceReader/LUSequenceReader.cpp
+++ b/Source/Readers/LUSequenceReader/LUSequenceReader.cpp
@ -59,7 +59,7 @@ void BatchLUSequenceReader<ElemType>::ReadLabelInfo(const wstring& vocfile,
    if (!vin.good())
        LogicError("LUSequenceReader cannot open %ls\n", vocfile.c_str());

-    wstring wstr = L" ";
+    const wstring wstr = L" \n\r\t";
    long b = 0;
    this->nwords = 0;
    int prevcls = -1;
@ -74,7 +74,7 @@ void BatchLUSequenceReader<ElemType>::ReadLabelInfo(const wstring& vocfile,
            break;
        if (readClass)
        {
-            vector<wstring> wordandcls = wsep_string(strtmp, wstr);
+            vector<wstring> wordandcls = SplitString(strtmp, wstr);
            long cls = _wtoi(wordandcls[1].c_str());
            word4cls[wordandcls[0]] = cls;

@ -1163,7 +1163,7 @@ void BatchLUSequenceReader<ElemType>::LoadWordMapping(const ConfigRecordType& re
            ss = trim(ss);
            if (ss.length() == 0)
                break;
-            vs = wsep_string(ss, L" ");
+            vs = SplitString(ss, L" \n\r\t");
            si = vs[0];
            so = vs[1];
            mWordMapping[si] = so;