# RosettaCodeData/Task/Tokenize-a-string-with-esca.../Python/tokenize-a-string-with-esca...
#
# 34 lines
# 1.1 KiB
# Python

import re
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'


def tokenize(string=STRING, escape='^', separator='|'):
    """Yield the tokens of *string*, split on unescaped separators.

    An occurrence of *escape* makes the character that follows it literal
    (a separator, another escape, a newline, ...); the escape character
    itself is removed from the yielded token.

    *escape* and *separator* are expected to be single characters: the
    token pattern puts them into a character class, which treats every
    character individually.

    A lone escape character at the very end of *string* has nothing to
    escape; it is dropped and tokenizing stops there.

    >>> list(tokenize('one^|uno||three^^^^|four^^^|^cuatro|'))
    ['one|uno', '', 'three^^', 'four^|cuatro', '']
    """
    re_escape, re_separator = map(re.escape, (escape, separator))

    # a raw token consists either of an escape sequence,
    # or a regular (non-escape, non-separator) character,
    # repeated arbitrarily many times (even zero);
    # DOTALL lets an escape sequence cover a newline as well
    token_regex = re.compile(
        fr'(?:{re_escape}.|[^{re_escape}{re_separator}])*',
        flags=re.DOTALL,
    )
    # gets rid of escape characters: '^^' -> '^' etc.
    unescape_regex = re.compile(fr'{re_escape}(.)', flags=re.DOTALL)

    # Tokens are matched one after another from an explicit position.
    # Looking for "a position preceded by a separator" instead would also
    # accept the position right after an *escaped* separator that ends a
    # token, producing a spurious empty token.
    position = 0
    while True:
        # the pattern can match the empty string, so this never fails
        found = token_regex.match(string, position)
        yield unescape_regex.sub(r'\1', found.group())
        position = found.end()
        # continue only if the token was terminated by a real separator
        if position >= len(string) or string[position] != separator:
            break
        position += 1
if __name__ == '__main__':
    # Script entry point: run the tokenizer with its default arguments
    # and print the collected tokens as a list.
    tokens = list(tokenize())
    print(tokens)