sep_string() now accepts multiple spaces in a row
This commit is contained in:
Родитель
343893e882
Коммит
828f4c28c0
|
@ -600,8 +600,8 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
|
|||
return lines;
|
||||
}
|
||||
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
|
||||
};
|
||||
};
|
||||
|
||||
}}
|
||||
|
||||
#ifdef _WIN32
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -863,37 +863,39 @@ static inline bool relpath(const wchar_t* path)
|
|||
// ... TODO: handle long NT paths
|
||||
return true; // all others
|
||||
}
|
||||
// String-object overload of relpath(): forwards the string's character buffer
// to the character-pointer overload and returns that overload's result.
template <class Char>
static inline bool relpath(const std::basic_string<Char>& s)
{
    return relpath(s.c_str());
}
|
||||
|
||||
// Trim from start: erases every leading character for which iscspace() is true.
// The string is modified in place; the returned reference is `s` itself so that
// calls can be nested (see trim()).
template<class String>
static inline String& ltrim(String& s)
{
    // the first character that is not whitespace ends the range to drop
    const auto firstKept = std::find_if(s.begin(), s.end(), [](typename String::value_type c){ return !iscspace(c); });
    s.erase(s.begin(), firstKept);
    return s;
}
|
||||
|
||||
// Trim from end: erases every trailing character for which iscspace() is true.
// The string is modified in place; the returned reference is `s` itself.
template<class String>
static inline String& rtrim(String& s)
{
    // search backwards for the last non-whitespace character; base() converts the
    // reverse iterator into the forward position just past that character
    const auto lastKept = std::find_if(s.rbegin(), s.rend(), [](typename String::value_type c){ return !iscspace(c); });
    s.erase(lastKept.base(), s.end());
    return s;
}
|
||||
|
||||
// Trim from both ends: applies rtrim() and then ltrim() to `s` in place and
// returns a reference to `s`.
template<class String>
static inline String& trim(String& s)
{
    return ltrim(rtrim(s));
}
|
||||
|
||||
// Splits `str` at each occurrence of the separator string `sep`; the input and
// every resulting piece are trimmed, and empty pieces are kept.
std::vector<std::string> sep_string(const std::string& str, const std::string& sep);
|
||||
// Wide-string counterpart of sep_string().
std::vector<std::wstring> wsep_string(const std::wstring& str, const std::wstring& sep); // TODO: overload sep_string with wstring type, no need for different name
|
||||
// Splits `str` at any single character contained in `sep` and returns the
// non-empty pieces in order; pieces are not trimmed.
template<class String>
|
||||
std::vector<String> SplitString(const String& str, const String& sep);
|
||||
// Convenience overload taking the delimiter characters as a C string:
// builds a String from `sep` and forwards to the overload above.
template<class String, class Char>
|
||||
std::vector<String> SplitString(const String& str, const Char* sep) { return SplitString(str, String(sep)); }
|
||||
|
||||
std::wstring s2ws(const std::string& str);
|
||||
|
||||
|
|
|
@ -1903,54 +1903,35 @@ bool msra::files::fuptodate(const wstring& target, const wstring& input, bool in
|
|||
return targettime >= inputtime; // note: uses an overload for WIN32 FILETIME (in Linux, FILETIME=time_t=size_t)
|
||||
}
|
||||
|
||||
/// separate string by separator
|
||||
vector<string> sep_string(const string& istr, const string& sep)
|
||||
{
|
||||
string str = istr;
|
||||
str = trim(str);
|
||||
vector<string> vstr;
|
||||
string csub;
|
||||
size_t ifound = 0;
|
||||
size_t ifoundlast = ifound;
|
||||
ifound = str.find(sep, ifound);
|
||||
while (ifound != std::string::npos)
|
||||
{
|
||||
csub = str.substr(ifoundlast, ifound - ifoundlast);
|
||||
vstr.push_back(trim(csub));
|
||||
|
||||
ifoundlast = ifound + 1;
|
||||
ifound = str.find(sep, ifoundlast);
|
||||
}
|
||||
csub = str.substr(ifoundlast, str.length() - ifoundlast);
|
||||
vstr.push_back(trim(csub));
|
||||
|
||||
return vstr;
|
||||
}
|
||||
|
||||
// Split a string into tokens.
//
// `sep` is a set of delimiter characters: the input is cut at every character
// that appears anywhere in `sep`. Empty tokens are dropped, so runs of
// consecutive delimiters (e.g. several spaces, or a trailing "\r\n") do not
// produce empty entries. Tokens are returned as-is, without trimming.
//
// str: text to split
// sep: delimiter characters; if empty, the whole non-empty input is one token
// returns: the non-empty tokens in order of appearance (possibly none)
template<class String>
std::vector<String> SplitString(const String& str, const String& sep)
{
    std::vector<String> vstr;
    size_t tokenBegin = 0;
    for (size_t hit = str.find_first_of(sep); hit != String::npos; hit = str.find_first_of(sep, tokenBegin))
    {
        if (hit > tokenBegin) // adjacent delimiters give an empty token: skip it
            vstr.push_back(str.substr(tokenBegin, hit - tokenBegin));
        tokenBegin = hit + 1;
    }

    // remainder after the last delimiter, if any characters are left
    if (tokenBegin < str.length())
        vstr.push_back(str.substr(tokenBegin));

    return vstr;
}
|
||||
|
||||
// Explicit instantiations of SplitString for narrow and wide strings.
// NOTE(review): presumably required because callers only see a declaration of
// the template while its body lives in this translation unit — confirm.
template vector<string> SplitString(const string& istr, const string& sep);
|
||||
template vector<wstring> SplitString(const wstring& istr, const wstring& sep);
|
||||
|
||||
static inline std::string wcstombs(const std::wstring& p) // output: MBCS
|
||||
{
|
||||
size_t len = p.length();
|
||||
|
|
|
@ -585,7 +585,7 @@ public:
|
|||
while (labels->size() - orgRecordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr)
|
||||
{
|
||||
ch.assign(ch2);
|
||||
std::vector<string> vstr = sep_string(ch, " ");
|
||||
std::vector<string> vstr = SplitString(ch, " \n\r\t");
|
||||
if (vstr.size() < 3) // TODO: Document this special condition. Why should we not process empty sequences like <s> </s>?
|
||||
continue;
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ long BatchLUSequenceParser<NumType, LabelType>::Parse(size_t recordsRequested, s
|
|||
// got a token
|
||||
tokenCount++;
|
||||
|
||||
vstr = wsep_string(ch, L" ");
|
||||
vstr = SplitString(ch, L" \n\r\t");
|
||||
if (vstr.size() < 2)
|
||||
continue;
|
||||
|
||||
|
|
|
@ -59,7 +59,7 @@ void BatchLUSequenceReader<ElemType>::ReadLabelInfo(const wstring& vocfile,
|
|||
if (!vin.good())
|
||||
LogicError("LUSequenceReader cannot open %ls\n", vocfile.c_str());
|
||||
|
||||
wstring wstr = L" ";
|
||||
const wstring wstr = L" \n\r\t";
|
||||
long b = 0;
|
||||
this->nwords = 0;
|
||||
int prevcls = -1;
|
||||
|
@ -74,7 +74,7 @@ void BatchLUSequenceReader<ElemType>::ReadLabelInfo(const wstring& vocfile,
|
|||
break;
|
||||
if (readClass)
|
||||
{
|
||||
vector<wstring> wordandcls = wsep_string(strtmp, wstr);
|
||||
vector<wstring> wordandcls = SplitString(strtmp, wstr);
|
||||
long cls = _wtoi(wordandcls[1].c_str());
|
||||
word4cls[wordandcls[0]] = cls;
|
||||
|
||||
|
@ -1163,7 +1163,7 @@ void BatchLUSequenceReader<ElemType>::LoadWordMapping(const ConfigRecordType& re
|
|||
ss = trim(ss);
|
||||
if (ss.length() == 0)
|
||||
break;
|
||||
vs = wsep_string(ss, L" ");
|
||||
vs = SplitString(ss, L" \n\r\t");
|
||||
si = vs[0];
|
||||
so = vs[1];
|
||||
mWordMapping[si] = so;
|
||||
|
|
Загрузка…
Ссылка в новой задаче