sep_string() now accepts multiple spaces in a row

This commit is contained in:
Frank Seide 2016-02-15 12:41:37 -08:00
Родитель 343893e882
Коммит 828f4c28c0
5 изменённых файлов: 38 добавлений и 55 удалений

Просмотреть файл

@ -600,8 +600,8 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
};
};
}}
#ifdef _WIN32
// ----------------------------------------------------------------------------
@ -863,37 +863,39 @@ static inline bool relpath(const wchar_t* path)
// ... TODO: handle long NT paths
return true; // all others
}
// Overload of relpath() for std::basic_string of any character type.
// It does no work of its own: it passes s.c_str() on to the relpath()
// overload taking a character pointer and returns that overload's result.
template <class CHAR>
static inline bool relpath(const std::basic_string<CHAR>& s)
template <class Char>
static inline bool relpath(const std::basic_string<Char>& s)
{
return relpath(s.c_str());
}
// trim from start
// Erases, in place, every leading character of s for which iscspace() is true,
// i.e. everything in front of the first character that fails that test (the
// whole string if no character fails it). Returns s itself by reference so
// that calls can be chained.
template<class STRING>
static inline STRING& ltrim(STRING& s)
template<class String>
static inline String& ltrim(String& s)
{
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](typename STRING::value_type c){ return !iscspace(c); }));
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](typename String::value_type c){ return !iscspace(c); }));
return s;
}
// trim from end
// Erases, in place, every trailing character of s for which iscspace() is true.
// The search runs over reverse iterators, so it finds the last character that
// fails the test; .base() converts that hit into the forward iterator just
// behind it, and everything from there to end() is erased (the whole string
// if no character fails the test). Returns s itself by reference.
template<class STRING>
static inline STRING& rtrim(STRING& s)
template<class String>
static inline String& rtrim(String& s)
{
s.erase(std::find_if(s.rbegin(), s.rend(), [](typename STRING::value_type c){ return !iscspace(c); }).base(), s.end());
s.erase(std::find_if(s.rbegin(), s.rend(), [](typename String::value_type c){ return !iscspace(c); }).base(), s.end());
return s;
}
// trim from both ends
// Applies rtrim() and then ltrim() to s. The string is modified in place and
// the same object is returned by reference.
template<class STRING>
static inline STRING& trim(STRING& s)
template<class String>
static inline String& trim(String& s)
{
return ltrim(rtrim(s));
}
// sep_string / wsep_string: split str at each occurrence of sep, where sep is
// matched as one substring. Every piece is trimmed and empty pieces are kept.
std::vector<std::string> sep_string(const std::string& str, const std::string& sep);
std::vector<std::wstring> wsep_string(const std::wstring& str, const std::wstring& sep); // TODO: overload sep_string with wstring type, no need for different name
// SplitString: split str at every character that occurs in sep, i.e. sep is a
// set of delimiter characters rather than one substring. Empty pieces are
// dropped, so a run of consecutive delimiters acts like a single one.
// The template is only declared here; its definition is explicitly
// instantiated for std::string and std::wstring.
template<class String>
std::vector<String> SplitString(const String& str, const String& sep);
// Convenience overload: takes the delimiter set as a character pointer (e.g. a
// string literal), converts it to String and forwards to the overload above.
template<class String, class Char>
std::vector<String> SplitString(const String& str, const Char* sep) { return SplitString(str, String(sep)); }
std::wstring s2ws(const std::string& str);

Просмотреть файл

@ -1903,54 +1903,35 @@ bool msra::files::fuptodate(const wstring& target, const wstring& input, bool in
return targettime >= inputtime; // note: uses an overload for WIN32 FILETIME (in Linux, FILETIME=time_t=size_t)
}
/// Separate a string into fields at each occurrence of a separator.
///
/// The input is copied and trimmed first, and every field is trimmed again
/// before it is stored, so whitespace next to a separator never ends up in a
/// field. Empty fields are kept: "a,,b" split at "," yields {"a", "", "b"},
/// and an input that contains no separator yields exactly one element.
///
/// @param istr  string to split; it is not modified
/// @param sep   separator, matched as one whole substring (not as a set of
///              characters); if empty, nothing is split
/// @return      the trimmed fields in order of appearance
vector<string> sep_string(const string& istr, const string& sep)
{
    string str = istr;
    trim(str); // trims the local copy in place

    vector<string> vstr;

    // An empty separator matches at every position. Splitting on it used to
    // run the scan past the end of the string and make substr() throw
    // std::out_of_range; return the trimmed input as the only field instead.
    if (sep.empty())
    {
        vstr.push_back(str);
        return vstr;
    }

    size_t ifoundlast = 0;
    size_t ifound = str.find(sep, ifoundlast);
    while (ifound != std::string::npos)
    {
        string csub = str.substr(ifoundlast, ifound - ifoundlast);
        vstr.push_back(trim(csub));
        // Resume behind the complete separator. Advancing by a single
        // character left the tail of a multi-character separator at the
        // front of the next field (e.g. "a::b" produced "a" and ":b").
        ifoundlast = ifound + sep.length();
        ifound = str.find(sep, ifoundlast);
    }

    // Whatever follows the last separator (possibly nothing) is the final field.
    string csub = str.substr(ifoundlast);
    vstr.push_back(trim(csub));
    return vstr;
}
// separate string by separator
// TODO: unify with above
// NOTE(review): this span is a rendered diff without +/- markers. Lines of the
// wide-string function wsep_string and of the template SplitString are
// interleaved below and have to be read as two separate versions.
//  - wsep_string: trims a copy of the input, splits at each occurrence of sep
//    as a whole substring (find), trims every piece and keeps empty pieces.
//  - SplitString: splits at every character contained in sep (find_first_of),
//    does not trim, and stores a piece only if it is non-empty.
vector<wstring> wsep_string(const wstring& istr, const wstring& sep)
template<class String>
vector<String> SplitString(const String& str, const String& sep)
{
wstring str = istr;
str = trim(str);
vector<wstring> vstr;
wstring csub;
vector<String> vstr;
String csub;
// ifoundlast marks where the current piece starts; ifound is the position of the next delimiter
size_t ifound = 0;
size_t ifoundlast = ifound;
ifound = str.find(sep, ifound);
while (ifound != std::wstring::npos)
ifound = str.find_first_of(sep, ifound);
while (ifound != String::npos)
{
// the piece is everything between the previous delimiter and this one
csub = str.substr(ifoundlast, ifound - ifoundlast);
vstr.push_back(trim(csub));
if (!csub.empty())
vstr.push_back(csub);
// continue one character behind the delimiter that was just found
ifoundlast = ifound + 1;
ifound = str.find(sep, ifoundlast);
ifound = str.find_first_of(sep, ifoundlast);
}
// the remainder behind the last delimiter is the final piece
csub = str.substr(ifoundlast, str.length() - ifoundlast);
vstr.push_back(trim(csub));
ifound = str.length();
csub = str.substr(ifoundlast, ifound - ifoundlast);
if (!csub.empty())
vstr.push_back(csub);
return vstr;
}
// explicit instantiations of SplitString for the narrow and the wide string type
template vector<string> SplitString(const string& istr, const string& sep);
template vector<wstring> SplitString(const wstring& istr, const wstring& sep);
static inline std::string wcstombs(const std::wstring& p) // output: MBCS
{
size_t len = p.length();

Просмотреть файл

@ -585,7 +585,7 @@ public:
while (labels->size() - orgRecordCount < recordsRequested && fgets(ch2, MAXSTRING, mFile) != nullptr)
{
ch.assign(ch2);
std::vector<string> vstr = sep_string(ch, " ");
std::vector<string> vstr = SplitString(ch, " \n\r\t");
if (vstr.size() < 3) // TODO: Document this special condition. Why should we not process empty sequences like <s> </s>?
continue;

Просмотреть файл

@ -89,7 +89,7 @@ long BatchLUSequenceParser<NumType, LabelType>::Parse(size_t recordsRequested, s
// got a token
tokenCount++;
vstr = wsep_string(ch, L" ");
vstr = SplitString(ch, L" \n\r\t");
if (vstr.size() < 2)
continue;

Просмотреть файл

@ -59,7 +59,7 @@ void BatchLUSequenceReader<ElemType>::ReadLabelInfo(const wstring& vocfile,
if (!vin.good())
LogicError("LUSequenceReader cannot open %ls\n", vocfile.c_str());
wstring wstr = L" ";
const wstring wstr = L" \n\r\t";
long b = 0;
this->nwords = 0;
int prevcls = -1;
@ -74,7 +74,7 @@ void BatchLUSequenceReader<ElemType>::ReadLabelInfo(const wstring& vocfile,
break;
if (readClass)
{
vector<wstring> wordandcls = wsep_string(strtmp, wstr);
vector<wstring> wordandcls = SplitString(strtmp, wstr);
long cls = _wtoi(wordandcls[1].c_str());
word4cls[wordandcls[0]] = cls;
@ -1163,7 +1163,7 @@ void BatchLUSequenceReader<ElemType>::LoadWordMapping(const ConfigRecordType& re
ss = trim(ss);
if (ss.length() == 0)
break;
vs = wsep_string(ss, L" ");
vs = SplitString(ss, L" \n\r\t");
si = vs[0];
so = vs[1];
mWordMapping[si] = so;