pjs/xpcom/ds/nsStringTokenizer.h

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL.  You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
 * Reserved.
 */

/**
 * MODULE NOTES:
 * @update  gess 4/1/98
 * 
 * This class knows how to read delimited data from a string.
 * Here are the 2 things you need to know to use this class effectively:
 *
 * ================================================
 * How To Setup The Tokenizer
 * ================================================
 *
 * The input charset can be either constrained or unconstrained. Constrained means
 * that you've chosen to allow only certain chars into your tokens. Unconstrained
 * means that any char (other than delimiters) is legal in your tokens.
 * If you want unconstrained input, use [*-*] as your dataspec. To constrain your token
 * charset, you set ranges or single chars in the dataspec like this:
 *    "abc[0-9]"  -- which allows numbers and the letters a,b,c
 *
 *  Dataspecifier rules:
 *    abc   -- allows a set of characters
 *    [a-z] -- allows all chars in given range
 *    [*-*] -- allows all characters          
 *    ^abc  -- disallows a set of characters            //NOT_YET_IMPLEMENTED
 *    [a^z] -- disallows all characters in given range  //NOT_YET_IMPLEMENTED
 *    [a*b] -- specifies a delimiter pair for the entire token
 *    [a+b] -- specifies a delimiter pair for substrings in the token
 *
 * One other note: there is an optional argument called allowQuoting, which tells
 * the tokenizer whether to allow quoted strings within your fields. If you set this
 * to TRUE, then we allow nested quoted strings, which themselves can contain any data.
 * It's considered an error to set allowQuoting=TRUE and use a quote as a token or record delimiter.
 *
 * The other thing you need to set up for the tokenizer to work correctly are the delimiters.
 * They separate fields and records, and must be different. You can also have more than one kind
 * of delimiter for each. The distinction between tokens and records allows the caller
 * to deal with multi-line text files (where \n is the record separator). Again, you don't have
 * to have a record separator if it doesn't make sense in the context of your input dataset.
 *
 *
 * ================================================
 * How To Iterate Tokens
 * ================================================
 *
 * There are 2 ways to iterate tokens, either manually or automatically.
 * The manual method requires that you call a set of methods in the right order,
 * but gives you slightly more control. Here's the calling pattern:
 *
 * {
 *    nsString theBuffer("xxxxxxx");
 *    nsStringTokenizer tok(...);
 *    tok.SetBuffer(theBuffer);
 *    tok.FirstRecord(); 
 *    while(tok.HasNextToken()){
 *      while(tok.HasNextToken()){
 *        nsAutoString theToken;
 *        tok.GetNextToken(theToken);
 *        //do something with your token here...
 *      } //while
 *      tok.NextRecord();
 *    } //while
 *  }
 *
 * The automatic method handles all the iteration for you. You provide a callback functor
 * and you'll get called once for each token per record. To use that technique, you need
 * to define an object that provides the ITokenizeFunctor interface (1 method). Then
 * call the tokenizer method Iterate(...). Voila.
 *
 */


#ifndef nsStringTokenizer_
#define nsStringTokenizer_

#include "nsString.h"

/**
 * Callback interface for nsStringTokenizer::Iterate().
 *
 * Derive from this class, implement operator(), and pass an instance to
 * Iterate(); per the module notes, it is called once for each token per
 * record.
 */
class ITokenizeFunctor {
public:
  /**
   * Receives one token.
   *
   * The return type is spelled out as int. The declaration used to omit it,
   * which only pre-standard compilers accept (treating it as implicit int);
   * ISO C++ requires an explicit type.
   *
   * @param aToken        the token handed to the callback
   * @param aRecordCount  record counter supplied by the tokenizer
   * @param aTokenCount   token counter supplied by the tokenizer
   * @return integer result; how Iterate() interprets it is not visible in
   *         this header -- TODO confirm against the implementation
   */
  virtual int operator()(nsString& aToken,PRInt32 aRecordCount,PRInt32 aTokenCount)=0;
};

/**
 * Reads delimited tokens, grouped into records, out of an nsString.
 *
 * Two usage modes are offered:
 *   - manual:   SetBuffer(), FirstRecord(), then HasNextToken()/GetNextToken()
 *               for each token and NextRecord() to advance;
 *   - callback: Iterate() with an ITokenizeFunctor.
 */
class nsStringTokenizer {
public:
  // The field separator defaults to "," and the record separator to "\n".
  nsStringTokenizer(const char* aFieldSep = ",", const char* aRecordSep = "\n");
  ~nsStringTokenizer();

  // ---- Manual iteration: call these yourself, in order ----
  void    SetBuffer(nsString& aBuffer);
  void    AddTokenSpec(const char* aTokenSpec = "");
  PRBool  FirstRecord();
  PRBool  NextRecord();
  PRBool  HasNextToken();
  PRInt32 GetNextToken(nsString& aToken);

  // ---- Callback iteration: use this one exclusively, not the set above ----
  PRInt32 Iterate(nsString& aBuffer, ITokenizeFunctor& aFunctor);

protected:

  // Categories a single character can be sorted into.
  enum eCharTypes {
    eUnknown,
    eDataChar,
    eFieldSeparator,
    eDataDelimiter,
    eRecordSeparator
  };

  // Kinds of character specification.
  enum eCharSpec {
    eGivenChars,
    eAllChars,
    eExceptChars
  };

  // Low-level scanning helpers.
  PRInt32       SkipOver(nsString& aSkipSet);
  PRInt32       SkipOver(PRUnichar aSkipChar);
  PRInt32       ReadUntil(nsString& aString, nsString& aTermSet, PRBool aState);
  PRInt32       ReadUntil(nsString& aString, PRUnichar aChar, PRBool aState);
  PRBool        More();
  PRInt32       GetChar(PRUnichar& aChar);
  void          UnGetChar(PRUnichar aChar);
  PRBool        SkipToValidData();

  // Dataspec handling and character classification.
  void          ExpandDataSpecifier(const char* aDataSpec);
  inline PRBool IsValidDataChar(PRUnichar aChar);
  eCharTypes    DetermineCharType(PRUnichar aChar);

  // NOTE(review): the two arrays below look like character bit sets built
  // from the dataspec -- confirm against the implementation.
  PRInt32    mValidChars[4];
  PRInt32    mInvalidChars[4];

  // NOTE(review): presumably the [a*b] whole-token delimiter pair.
  nsString   mDataStartDelimiter;
  nsString   mDataEndDelimiter;

  // NOTE(review): presumably the [a+b] substring delimiter pair.
  nsString   mSubstrStartDelimiter;
  nsString   mSubstrEndDelimiter;

  nsString   mFieldSeparator;
  nsString   mRecordSeparator;

  // NOTE(review): presumably the current read position within *mBuffer.
  PRInt32    mOffset;
  eCharSpec  mCharSpec;

  // NOTE(review): ownership of the pointed-to string is not visible here.
  nsString*  mBuffer;
};

#endif
back out erroneously updated files 1999-07-09 10:01:55 +04:00			`/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/*`
			`* The contents of this file are subject to the Netscape Public License`
			`* Version 1.0 (the "NPL"); you may not use this file except in`
			`* compliance with the NPL. You may obtain a copy of the NPL at`
			`* http://www.mozilla.org/NPL/`
			`*`
			`* Software distributed under the NPL is distributed on an "AS IS" basis,`
			`* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL`
			`* for the specific language governing rights and limitations under the`
			`* NPL.`
			`*`
			`* The Initial Developer of this code under the NPL is Netscape`
			`* Communications Corporation. Portions created by Netscape are`
			`* Copyright (C) 1998 Netscape Communications Corporation. All Rights`
			`* Reserved.`
			`*/`

			`/**`
			`* MODULE NOTES:`
			`* @update gess 4/1/98`
			`*`
			`* This class knows how to read delimited data from a string.`
			`* Here are the 2 things you need to know to use this class effectively:`
			`*`
			`* ================================================`
			`* How To Setup The Tokenizer`
			`* ================================================`
			`*`
			`* The input charset can be either constrained or uncontrained. Constrained means`
			`* that you've chosen to allow only certain chars into your tokens. Unconstrained`
			`* means that any char (other than delimiters) are legal in your tokens.`
			`* If you want unconstrained input, use [-] your dataspec. To contrain your token`
			`* charset, you set ranges or single chars in the dataspec like this:`
			`* "abc[0-9]" -- which allow numbers and the letters a,b,c`
			`*`
			`* Dataspecifier rules:`
			`* abc -- allows a set of characters`
			`* [a-z] -- allows all chars in given range`
			`* [-] -- allows all characters`
			`* ^abc -- disallows a set of characters //NOT_YET_IMPLEMENTED`
			`* [a^z] -- disallows all characters in given range //NOT_YET_IMPLEMENTED`
			`* [a*b] -- specifies a delimiter pair for the entire token`
			`* [a+b] -- specifies a delimiter pair for substrings in the token`
			`*`
			`* One other note: there is an optional argument called allowQuoting, which tells`
			`* the tokenizer whether to allow quoted strings within your fields. If you set this`
			`* to TRUE, then we allow nested quoted strings, which themselves can contain any data.`
			`* It's considered an error to set allowQuoting=TRUE and use a quote as a token or record delimiter.`
			`*`
			`* The other thing you need to set up for the tokenizer to work correctly are the delimiters.`
			`* They seperate fields and records, and be different. You can also have more than one kind`
			`* of delimiter for each. The distinguishment between tokens are records allows the caller`
			`* to deal with multi-line text files (where \n is the record seperator). Again, you don't have`
			`* to have a record seperator if it doesn't make sense in the context of your input dataset.`
			`*`
			`*`
			`* ================================================`
			`* How To Iterate Tokens`
			`* ================================================`
			`*`
			`* There are 2 ways to iterate tokens, either manually or automatically.`
			`* The manual method requires that you call a set of methods in the right order,`
			`* but gives you slightly more control. Here's the calling pattern:`
			`*`
			`* {`
			`* nsString theBuffer("xxxxxxx");`
			`* nsStringTokenizer tok(...);`
			`* tok.SetBuffer(theBuffer);`
			`* tok.FirstRecord();`
			`* while(tok.HasNextToken()){`
			`* while(tok.HasNextToken()){`
			`* nsAutoString theToken;`
			`* tok.GetNextToken(theToken);`
			`* //do something with your token here...`
			`* } //while`
			`* tok.NextRecord();`
			`* } //while`
			`* }`
			`*`
			`* The automatic method handles all the iteration for you. You provide a callback functor`
			`* and you'll get called once for each token per record. To use that technique, you need`
			`* to define an object that provides the ITokenizeFunctor interface (1 method). Then`
			`* call the tokenizer method Iterate(...). Voila.`
			`*`
			`*/`


			`#ifndef nsStringTokenizer_`
			`#define nsStringTokenizer_`

			`#include "nsString.h"`

			`class ITokenizeFunctor {`
			`public:`
			`virtual operator ()(nsString& aToken,PRInt32 aRecordCount,PRInt32 aTokenCount)=0;`
			`};`

			`class nsStringTokenizer {`
			`public:`
update to new nsStringTokenizer -- not in tip 1999-07-09 20:51:57 +04:00			`nsStringTokenizer(const char* aFieldSep=",",const char* aRecordSep="\n");`
back out erroneously updated files 1999-07-09 10:01:55 +04:00			`~nsStringTokenizer();`


			`//Call these methods if you want to iterate the tokens yourself`
			`void SetBuffer(nsString& aBuffer);`
update to new nsStringTokenizer -- not in tip 1999-07-09 20:51:57 +04:00			`void AddTokenSpec(const char* aTokenSpec="");`
back out erroneously updated files 1999-07-09 10:01:55 +04:00			`PRBool FirstRecord(void);`
			`PRBool NextRecord(void);`
			`PRBool HasNextToken(void);`
			`PRInt32 GetNextToken(nsString& aToken);`

			`//Call this one (exclusively) if you want to be called back iteratively`
			`PRInt32 Iterate(nsString& aBuffer,ITokenizeFunctor& aFunctor);`

			`protected:`

			`enum eCharTypes {eUnknown,eDataChar,eFieldSeparator,eDataDelimiter,eRecordSeparator};`
			`enum eCharSpec {eGivenChars,eAllChars,eExceptChars};`

			`PRInt32 SkipOver(nsString& aSkipSet);`
			`PRInt32 SkipOver(PRUnichar aSkipChar);`
improvements to stringtokenizer 1999-07-15 10:40:44 +04:00			`PRInt32 ReadUntil(nsString& aString,nsString& aTermSet,PRBool aState);`
back out erroneously updated files 1999-07-09 10:01:55 +04:00			`PRInt32 ReadUntil(nsString& aString,PRUnichar aChar,PRBool aState);`
			`PRBool More(void);`
			`PRInt32 GetChar(PRUnichar& aChar);`
			`void UnGetChar(PRUnichar aChar);`
			`PRBool SkipToValidData(void);`
			`void ExpandDataSpecifier(const char* aDataSpec) ;`
			`inline PRBool IsValidDataChar(PRUnichar aChar);`
			`eCharTypes DetermineCharType(PRUnichar aChar);`

			`PRInt32 mValidChars[4];`
			`PRInt32 mInvalidChars[4];`
			`nsString mDataStartDelimiter;`
			`nsString mDataEndDelimiter;`
			`nsString mSubstrStartDelimiter;`
			`nsString mSubstrEndDelimiter;`
			`nsString mFieldSeparator;`
			`nsString mRecordSeparator;`
			`PRInt32 mOffset;`
			`eCharSpec mCharSpec;`
			`nsString* mBuffer;`
			`};`

			`#endif`