// Source: code.4fips.com/browse/forums/2012/tokenizer

/*
(c) 2012 +++ Filip Stoklas, aka FipS, http://www.4FipS.com +++
THIS CODE IS FREE - LICENSED UNDER THE MIT LICENSE
ARTICLE URL: http://forums.4fips.com/viewtopic.php?f=3&t=810
*/

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

/// A very lightweight string Tokenizer that can handle quoted tokens.
///
/// The tokenizer stores only the `text` and `delims` pointers (nothing is
/// copied at construction), so both strings must stay alive and unmodified
/// for as long as next() is being called.
///
/// Tokens are runs of characters separated by one or more delimiter
/// characters. If a quote character is set, text enclosed in a pair of quotes
/// forms a single token (delimiters inside it are kept), an empty pair of
/// quotes yields an empty token, and adjacent quoted tokens are returned
/// individually. A quoted token whose closing quote is missing is discarded
/// together with the rest of the input.
class Tokenizer
{
 public:

    /// @param text   NUL-terminated input to split; must not be null.
    /// @param delims NUL-terminated set of delimiter characters; must not be null.
    /// @param quote  quote character; '\0' (the default) disables quoting.
    explicit Tokenizer(const char *text, const char *delims = " \t\n", char quote = '\0');

    /// Extracts the next token.
    ///
    /// @return a pointer to an internal NUL-terminated copy of the token, or
    ///         nullptr once the input is exhausted. The returned buffer is
    ///         overwritten by the following call, so copy it if it has to
    ///         survive that call.
    const char * next() const;

 private:

    // these are only refs to the external data!
    const char *_text;
    const char *_delims;
    char _quote;

    // mutable so that next() can be called through a const Tokenizer
    mutable std::size_t _offset; // where the next scan starts, or size_t(-1) when finished
    mutable std::string _token; // a copy of the current token
};

Tokenizer::Tokenizer(const char *text, const char *delims, char quote):
_text(text),
_delims(delims),
_quote(quote),
_offset(0),
_token()
{
    assert(text && delims);
}

const char * Tokenizer::next() const
{
    // offset terminator (0xffff...): marks the input as fully consumed
    constexpr std::size_t end_of_string = static_cast<std::size_t>(-1);

    if(_offset == end_of_string) // nothing more to consume (from the previous run)
        return nullptr;

    // eat up left delimiter(s), stop at the first token char, quote, or '\0'
    assert(_offset <= std::strlen(_text)); // it's allowed to point to '\0' as well...
    const char *from = _text + _offset;
    while(*from && std::strchr(_delims, *from)) ++from;

    // A '\0' quote means "quoting disabled". Without this guard the string
    // terminator itself would compare equal to the quote, be treated as an
    // opening quote, and the scan would continue past the end of the text.
    const bool quoted_token = _quote != '\0' && *from == _quote;
    // safe: when quoted_token is true, *from is a real (non-'\0') character,
    // so from[1] is at worst the terminator
    const bool double_quotes = quoted_token && from[1] == _quote;

    if(quoted_token)
        ++from; // skip the opening quote

    if(!*from) // end-of-string reached
    {
        //  F     F
        // .0 | ."0
        _offset = end_of_string; // finalize & exit
        return nullptr;
    }

    //  FT     FT     F~~~T
    // .A? | ."A? | .""?
    const char *to = from;
    if(double_quotes)
    {
        // `from` already sits on the closing quote: the token is empty
    }
    else if(quoted_token)
    {
        to = std::strchr(from + 1, _quote); // stop at the next quote
        if(!to) // missing right quote, discard the whole token
        {
            _offset = end_of_string; // finalize & exit
            return nullptr;
        }
    }
    else
    {
        // stop at the next delim, '\0' or on an unexpected quote
        // (strchr also matches the terminator of _delims, which ends the loop at '\0')
        ++to;
        while(*to != _quote && !std::strchr(_delims, *to)) ++to;
    }

    //  F  T    F  T     F  T    FxxT ~ unexpected
    // .AAA0 | .AAA. | ."AAA" | .AAA"

    // copy the token: [from, to)
    _token.assign(from, to);

    // advance the offset
    if(*to)
    {
        // an unexpected quote opens the next token, so it must be re-read;
        // a delimiter or a closing quote is consumed
        const bool unexpected_quote = !quoted_token && (*to == _quote);
        const std::size_t step = unexpected_quote ? 0 : 1;
        _offset = static_cast<std::size_t>(to - _text) + step;
    }
    else // end-of-string reached
    {
        _offset = end_of_string;
    }

    return _token.c_str();
}

/// Demo and self-test driver: prints the tokens of two sample strings and then
/// runs a set of assert-based checks on the Tokenizer (the checks are compiled
/// out when NDEBUG is defined).
int main()
{
    {
        // tokenize a string using <comma> and <space> as delimiters,
        // and <apostrophe> as a quote character:

        const Tokenizer tk("John, Jane, 'John Doe', 'Jane Roe'", ", ", '\'');
        for(const char *token = tk.next(); token; token = tk.next())
        {
            printf("<%s>\n", token);
        }
    }

    {
        // tokenize key/value pairs (with optional '=' separator),
        // note that multiple consecutive quoted tokens are treated individually:

        const Tokenizer tk("key1 = value1 | 'multi key2' = 'multi value2' | 'key3''value3'", "|", '\0');
        for(const char *token = tk.next(); token; token = tk.next())
        {
            const Tokenizer tk2(token, "= ", '\''); // we need both here: '=' and <space>

            // next() returns nullptr when there is nothing to extract; building a
            // std::string from a null pointer is undefined, so check first
            const char *raw_key = tk2.next();
            if(!raw_key)
                continue; // a pair without any key: nothing to print

            const std::string key = raw_key; // keep a copy, the next call overwrites the token buffer
            const char *value = tk2.next();

            // a null pointer must not reach "%s"; print a missing value as empty
            printf("<%s> = <%s>\n", key.c_str(), value ? value : "");
        }
    }

    // also, run a bunch of tests:

    // Tokenizes `input` with '.' as the delimiter and '\'' as the quote, wraps
    // every token in <...>, and compares the concatenation with `expected_result`.
    auto test_tokenizer = [](const std::string &input, const std::string &expected_result)->bool
    {
        std::string result;
        const Tokenizer tk(input.c_str(), ".", '\'');
        for(const char *token = tk.next(); token; token = tk.next())
        {
            result += std::string("<") + token + ">";
        }
        return result == expected_result;
    };
    (void)test_tokenizer; // only referenced from assert(), i.e. unused when NDEBUG is defined

    // well formatted
    assert(test_tokenizer(".tok.tok.'.tok.'.'.tok.''.tok.'", "<tok><tok><.tok.><.tok.><.tok.>"));
    assert(test_tokenizer("", ""));
    assert(test_tokenizer(".", ""));
    assert(test_tokenizer("''", "<>"));
    assert(test_tokenizer("'.'", "<.>"));
    assert(test_tokenizer("''''''", "<><><>"));
    assert(test_tokenizer(".''.''.''.", "<><><>"));
    assert(test_tokenizer("A", "<A>"));
    assert(test_tokenizer("ABC", "<ABC>"));
    assert(test_tokenizer("'A''B''C'", "<A><B><C>"));
    assert(test_tokenizer(".A.", "<A>"));
    assert(test_tokenizer(".'.A.'.", "<.A.>"));
    assert(test_tokenizer(".A.'.B.'.C.", "<A><.B.><C>"));
    assert(test_tokenizer(".'.ABC.'.'.DEF.'.", "<.ABC.><.DEF.>"));

    // mismatching right quotes
    assert(test_tokenizer(".tok.tok.'.tok.'.'.tok.''.tok.''.ignored.", "<tok><tok><.tok.><.tok.><.tok.>"));
    assert(test_tokenizer("'", ""));
    assert(test_tokenizer(".'.", ""));
    assert(test_tokenizer("'''", "<>"));
    assert(test_tokenizer(".'''.", "<>"));
    assert(test_tokenizer(".''.''.'.", "<><>"));
    assert(test_tokenizer("'A'B'", "<A><B>"));
    assert(test_tokenizer(".A.'.B.'.C.'.D.", "<A><.B.><C>"));
    assert(test_tokenizer("A.''B'C.D", "<A><><B>"));
    assert(test_tokenizer(".A.''.B.'''''.C.", "<A><><B><><>"));

    return 0;
}

// output:
// <John>
// <Jane>
// <John Doe>
// <Jane Roe>
// <key1> = <value1>
// <multi key2> = <multi value2>
// <key3> = <value3>