RosettaCodeData/Task/Tokenize-a-string-with-esca.../Python/tokenize-a-string-with-esca...

39 lines
980 B
Python

import re
# Sample input used when tokenize() is called without arguments.
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'


def tokenize(string=STRING, escape='^', separator='|'):
    """Split *string* on *separator*, honouring *escape* sequences.

    An escape followed by any character (including a newline, the
    separator or the escape itself) contributes that character literally
    to the current token. An unescaped separator ends the current token
    and starts a new one. Everything else is copied verbatim.

    Both delimiters may be longer than one character. If the text at some
    position could start either delimiter, the escape is tried first.
    An escape at the very end of the input, with nothing left to escape,
    is dropped.

    Args:
        string: The text to tokenize.
        escape: The escape sequence; must be non-empty.
        separator: The token separator; must be non-empty.

    Returns:
        A list of tokens. It always holds at least one element, so an
        empty input yields ``['']``.

    Raises:
        ValueError: If *escape* or *separator* is empty.

    >>> tokenize('one^|uno||three^^^^|four^^^|^cuatro|')
    ['one|uno', '', 'three^^', 'four^|cuatro', '']
    """
    if not escape or not separator:
        raise ValueError('escape and separator must be non-empty strings')

    esc, sep = re.escape(escape), re.escape(separator)
    lexer = re.compile(
        # an escape followed by a character produces that character
        fr'{esc}(?P<escaped>.)'
        # a separator not preceded by an escape starts a new token
        fr'|(?P<separator>{sep})'
        # a run of regular text; the lookahead (rather than a character
        # class) keeps this correct for multi-character delimiters
        fr'|(?P<plain>(?:(?!{esc}|{sep}).)+)'
        # an escape with nothing after it can only occur at end of input
        fr'|(?P<dangling>{esc})',
        # DOTALL so that newlines can be escaped and can appear in text
        re.DOTALL,
    )

    tokens = []
    pieces = []  # fragments of the token currently being assembled
    # The four alternatives cover every possible position in the input,
    # so finditer() never skips over a character.
    for match in lexer.finditer(string):
        kind = match.lastgroup
        if kind == 'separator':
            tokens.append(''.join(pieces))
            pieces = []
        elif kind != 'dangling':
            pieces.append(match.group(kind))
    # The text after the last separator (possibly empty) is a token too.
    tokens.append(''.join(pieces))
    return tokens
if __name__ == '__main__':
    # Demo: tokenize the sample string. tokenize() already returns a
    # list, so it is printed directly without an extra copy.
    print(tokenize())