# RosettaCodeData/Task/Tokenize-a-string-with-esca.../Python/tokenize-a-string-with-esca...

import re

# Sample input from the task description; used as the default for tokenize().
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'


def tokenize(string=STRING, escape='^', separator='|'):
    """Split *string* on unescaped separators and yield the unescaped tokens.

    An occurrence of *escape* makes the single character that follows it
    literal (including a separator, another escape, or a newline); the
    escape itself is removed from the output.  A dangling escape at the
    very end of the input has nothing to protect and is dropped.

    Args:
        string: The text to tokenize.
        escape: The escape marker; expected to be a non-empty string.
        separator: The token delimiter; expected to be a non-empty string.

    Yields:
        Each token in order, with escape markers removed.  Empty tokens
        (between adjacent separators, or after a trailing separator) are
        yielded as ''.  An empty input yields a single '' token.

    Example:
        >>> list(tokenize('one^|uno||three^^^^|four^^^|^cuatro|'))
        ['one|uno', '', 'three^^', 'four^|cuatro', '']
    """
    re_escape, re_separator = map(re.escape, (escape, separator))

    # A single left-to-right scan decides what every delimiter means.  The
    # escape alternative comes first so that an escaped separator is consumed
    # as a literal and can never be mistaken for a token boundary.  DOTALL
    # lets an escape protect a newline too; `.?` tolerates a dangling escape
    # at the end of the input.
    scanner = re.compile(
        fr'{re_escape}(?P<escaped>.?)|(?P<separator>{re_separator})',
        flags=re.DOTALL,
    )

    pieces = []    # fragments of the token currently being assembled
    position = 0   # index just past the last delimiter handled

    for match in scanner.finditer(string):
        # plain text between the previous delimiter and this one
        pieces.append(string[position:match.start()])
        position = match.end()

        if match.group('separator') is None:
            # escape sequence: keep only the protected character
            pieces.append(match.group('escaped'))
        else:
            # unescaped separator: the current token is complete
            yield ''.join(pieces)
            pieces = []

    # whatever follows the last separator is the final (possibly empty) token
    pieces.append(string[position:])
    yield ''.join(pieces)

if __name__ == '__main__':
    # Demo: tokenize the default sample string and print the token list.
    tokens = list(tokenize())
    print(tokens)