// Copyright (C) 1999-2005 Open Source Telecom Corporation.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// As a special exception, you may use this file as part of a free software
// library without restriction.  Specifically, if other files instantiate
// templates or use macros or inline functions from this file, or you compile
// this file and link it with other files to produce an executable, this
// file does not by itself cause the resulting executable to be covered by
// the GNU General Public License.  This exception does not however
// invalidate any other reasons why the executable file might be covered by
// the GNU General Public License.
//
// This exception applies only to the code released under the name GNU
// Common C++.  If you copy code from other releases into a copy of GNU
// Common C++, as the General Public License permits, the exception does
// not apply to the code that you add in this way.  To avoid misleading
// anyone as to the status of such modified files, you must delete
// this exception notice from them.
//
// If you write modifications of your own for GNU Common C++, it is your choice
// whether to permit this exception to apply to your modifications.
// If you do not wish that, delete this exception notice.
//

/**
 * @file tokenizer.h
 * @short string tokenizer.
 **/

#ifndef	CCXX_TOKENIZER_H_
#define	CCXX_TOKENIZER_H_

#ifndef CCXX_MISSING_H_
#include <cc++/missing.h>
#endif

#ifndef CCXX_THREAD_H_
#include <cc++/thread.h>
#endif

#ifdef	CCXX_NAMESPACES
namespace ost {
#endif

/**
 * Splits delimited string into tokens.
 *
 * The StringTokenizer takes a pointer to a string and a pointer
 * to a string containing a number of possible delimiters.
 * The StringTokenizer provides an input forward iterator which lets
 * you iterate through all tokens. An iterator behaves like a logical
 * pointer to the tokens, i.e. to shift to the next token you
 * increment the iterator, and you get the token by dereferencing the
 * iterator.
 *
 * Memory consumption:
 * This class operates on the original string and only allocates memory
 * for the individual tokens actually requested, so this class
 * allocates at maximum the space required for the longest token in the
 * given string.
 * Since for each iteration, memory is reclaimed for the last token,
 * you MAY NOT store pointers to them; if you need them afterwards,
 * copy them. You may not modify the original string while you operate
 * on it with the StringTokenizer; the behaviour is undefined in that
 * case.
 *
 * The iterator has one special method 'nextDelimiter()' which returns
 * a character containing the next delimiter following this
 * tokenization process or '\\0', if there are no following delimiters. In
 * case of skipAllDelim, it returns the FIRST delimiter.
 *
 * With the method 'setDelimiters(const char*)' you may change the
 * set of delimiters. It affects all running iterators.
 *
 * Example:
 * <code><pre>
 *  StringTokenizer st("mary had a little lamb;its fleece was..", " ;");
 *  StringTokenizer::iterator i;
 *  for (i = st.begin() ; i != st.end() ; ++i) {
 *        cout << "Token: '" << *i << "'\t";
 *        cout << " next Delim: '" << i.nextDelimiter() << "'" << endl;
 *  }
 *  </pre></code>
 *
 * @author Henner Zeller <H.Zeller@acm.org>
 * @license LGPL
 */
class __EXPORT StringTokenizer {
public:
	/**
	 * A delimiter string containing all usual whitespace delimiters.
	 * These are space, tab, newline, carriage return,
	 * formfeed and vertical tab (see the isspace() manpage).
	 */
	static const char * const SPACE;

	/**
	 * Exception thrown if someone tries to read beyond the
	 * end of the tokens.
	 * This will not happen if you iterate the 'clean' way, comparing
	 * against end(), but it can happen if you skip some tokens because
	 * you 'know' they are there. It simplifies error handling a lot:
	 * you can just read your tokens the way you expect them, and if
	 * there is some error in the input this exception will be thrown.
	 */
	// maybe move more global ?
	class NoSuchElementException { };

	/**
	 * The input forward iterator for tokens.
	 *
	 * An iterator only stores positions inside the tokenized string plus
	 * an optional heap buffer ('token'). Copies of an iterator share the
	 * positions but never the buffer.
	 *
	 * @author Henner Zeller
	 */
	class __EXPORT iterator {
		friend class StringTokenizer;  // access our private constructors
	private:
		const StringTokenizer *myTok; // my StringTokenizer
		const char *start;      // start of current token
		const char *tokEnd;     // end of current token (->nxDelimiter)
		const char *endp;       // one before next token
		char *token;            // allocated token, if requested

		/**
		 * Builds the end marker (used to initialise itEnd).
		 * No token is current, so every position except 'endp' is
		 * null. 'start' is initialised explicitly because the copy
		 * constructor and the assignment operator read it.
		 *
		 * @param tok  tokenizer this iterator belongs to.
		 * @param end  position stored in 'endp'; it is the only
		 *             value the comparison operators look at.
		 */
		iterator(const StringTokenizer &tok, const char *end)
			: myTok(&tok),start(0),tokEnd(0),endp(end),token(0) {}

		/**
		 * Builds a begin iterator for the given tokenizer.
		 * 'endp' starts one position before the string, matching the
		 * "one before next token" convention of that member; the
		 * increment then moves on to the first token.
		 * NOTE(review): dereferences tok.str, so this presumably
		 * requires a non-null string -- confirm against the
		 * StringTokenizer constructors.
		 *
		 * @param tok  tokenizer this iterator belongs to.
		 */
		iterator(const StringTokenizer &tok)
			: myTok(&tok),start(0),tokEnd(0),endp(myTok->str-1),token(0) {
			++(*this); // init first token.
		}

	public:
		/**
		 * Creates an unbound iterator: all pointers are null.
		 * Assign a real iterator (e.g. from begin()) before use.
		 */
		iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {}

		/**
		 * Releases the token buffer, if one was allocated.
		 * The first byte is overwritten with '\0' before the buffer
		 * is freed.
		 */
		// see also: comment in implementation of operator++
		virtual ~iterator() 
			{ if (token) *token='\0'; delete [] token; }

		/**
		 * copy constructor.
		 * Copies all positions but not the allocated token; the new
		 * iterator starts without a token buffer of its own.
		 */
		// everything, but not responsible for the allocated token.
		iterator(const iterator& i) :
			myTok(i.myTok),start(i.start),tokEnd(i.tokEnd),
			endp(i.endp),token(0) {}

		/**
		 * assignment operator.
		 * Takes over all positions of the other iterator and drops
		 * this iterator's own token buffer; the other iterator's
		 * buffer is neither copied nor shared.
		 */
		// everything, but not responsible for the allocated token.
		iterator &operator = (const iterator &i) 
		{
			myTok = i.myTok;
			start = i.start; endp = i.endp; tokEnd = i.tokEnd;
			// delete [] on a null pointer is a no-op, so no guard
			// is needed.
			delete [] token;
			token = 0;
			return *this;
		}

		/**
		 * shifts this iterator to the next token in the string.
		 */
		iterator &operator ++ () THROWS (NoSuchElementException);

		/**
		 * returns the immutable string this iterator
		 * points to or '0' if no token is available (i.e.
		 * i == end()).
		 * Do not store pointers to this token, since it is
		 * invalidated for each iteration. If you need the token,
		 * copy it (e.g. with strdup());
		 */
		const char*  operator *  () THROWS (NoSuchElementException);

		/**
		 * returns the next delimiter after the current token or
		 * '\\0', if there are no following delimiters.
		 * It returns the very next delimiter (even if
		 * skipAllDelim=true).
		 */
		inline char nextDelimiter() const 
			{return (tokEnd) ? *tokEnd : '\0';}

		/**
		 * compares to other iterator. Usually used to
		 * compare against the end() iterator.
		 * Only the end positions ('endp') are compared.
		 */
		// only compare the end-position. speed.
		inline bool operator == (const iterator &other) const 
			{return (endp == other.endp);}

		/**
		 * compares to other iterator. Usually used to
		 * compare against the end() iterator.
		 * Only the end positions ('endp') are compared.
		 */
		// only compare the end position. speed.
		inline bool operator != (const iterator &other) const 
			{return (endp != other.endp);}
	};
private:
	friend class StringTokenizer::iterator;
	const char *str;        // string being tokenized (not owned)
	const char *delim;      // current set of delimiter characters (not owned)
	bool skipAll, trim;     // the skipAllDelim / trim options
	iterator itEnd;         // end marker handed out by end()

public:
	/**
	 * creates a new StringTokenizer for a string
	 * and a given set of delimiters.
	 *
	 * @param  str          String to be split up. This string will
	 *                      not be modified by this StringTokenizer,
	 *                      but you must not modify this string
	 *                      while tokenizing is in process either;
	 *                      doing so may lead to undefined behaviour.
	 *
	 * @param  delim        String containing the characters
	 *                      which should be regarded as delimiters.
	 *
	 * @param  skipAllDelim OPTIONAL.
	 *                      true, if subsequent
	 *                      delimiters should be skipped at once
	 *                      or false, if empty tokens should
	 *                      be returned for two delimiters with
	 *                      no other text inbetween. The first
	 *                      behaviour may be desirable for whitespace
	 *                      skipping, the second for input with
	 *                      delimited entry e.g. /etc/passwd like files
	 *                      or CSV input.
	 *                      NOTE, that 'true' here resembles the
	 *                      ANSI-C strtok(char *s,char *d) behaviour.
	 *                      DEFAULT = false
	 *
	 * @param trim          OPTIONAL.
	 *                      true, if the tokens returned
	 *                      should be trimmed, so that they don't have
	 *                      any whitespaces at the beginning or end.
	 *                      Whitespaces are any of the characters
	 *                      defined in StringTokenizer::SPACE.
	 *                      If delim itself is StringTokenizer::SPACE,
	 *                      this will result in a behaviour with
	 *                      skipAllDelim = true.
	 *                      DEFAULT = false
	 */
	StringTokenizer (const char *str,
			 const char *delim,
			 bool skipAllDelim = false,
			 bool trim = false);

	/**
	 * create a new StringTokenizer which splits the input
	 * string at whitespaces. The tokens are stripped from
	 * whitespaces. This means, if you change the set of
	 * delimiters in either the 'begin(const char *delim)' method
	 * or in 'setDelimiters()', you then get whitespace
	 * trimmed tokens, delimited by the new set.
	 * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true);
	 */
	StringTokenizer (const char *s);

	/**
	 * returns the begin iterator
	 */
	iterator begin() const 
		{return iterator(*this);}

	/**
	 * changes the set of delimiters used in subsequent
	 * iterations.
	 * Only the pointer is stored, the characters are not copied,
	 * so 'd' has to stay valid for as long as it is in use.
	 */
	void setDelimiters (const char *d) 
		{delim = d;}

	/**
	 * returns a begin iterator with an alternate set of
	 * delimiters.
	 * The new set replaces the tokenizer's current delimiters
	 * (same effect as setDelimiters(d)); only the pointer is
	 * stored.
	 */
	iterator begin(const char *d) 
	{
		delim = d;
		return iterator(*this);
	}

	/**
	 * the iterator marking the end.
	 */
	const iterator& end() const 
		{return itEnd;}
};

#ifdef	CCXX_NAMESPACES
}
#endif

#endif

/** EMACS **
 * Local variables:
 * mode: c++
 * c-basic-offset: 8
 * End:
 */