39 lines
980 B
Python
39 lines
980 B
Python
import re
|
|
|
|
# Sample input used as the default argument of tokenize():
# '^' escapes the character that follows it and '|' separates tokens.
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'


def tokenize(string=STRING, escape='^', separator='|'):
    """Split *string* into tokens on *separator*, honouring *escape*.

    An escape followed by any character (including a newline, another
    escape, or a separator) contributes that character literally to the
    current token.  An unescaped separator ends the current token and
    starts a new, initially empty, one.

    Args:
        string: Text to tokenize.
        escape: Escape marker.  The implementation reads the escaped
            character as ``substring[1]`` and builds a character class
            from it, so it is meant to be a single character.
        separator: Token separator; likewise meant to be a single
            character.

    Returns:
        The list of tokens.  It always contains at least one element:
        an empty input yields ``['']``.  A lone escape at the very end
        of the input has nothing to escape and is dropped.
    """
    # Regex-quote the markers under new names instead of shadowing the
    # parameters, so the raw values stay available for debugging.
    escape_re, separator_re = map(re.escape, (escape, separator))

    # The token being built is always the last element of this list.
    tokens = ['']

    def start_new_token(scanner, substring):
        tokens.append('')

    def add_escaped_char(scanner, substring):
        # substring is the escape marker followed by exactly one character.
        tokens[-1] += substring[1]

    def add_substring(scanner, substring):
        tokens[-1] += substring

    scanner = re.Scanner(
        [
            # an escape followed by a character produces that character
            (fr'{escape_re}.', add_escaped_char),

            # when encountering a separator not preceded by an escape,
            # start a new token
            (fr'{separator_re}', start_new_token),

            # a sequence of regular characters (i.e. not escape or
            # separator) is just appended to the token
            (fr'[^{escape_re}{separator_re}]+', add_substring),
        ],
        # DOTALL lets '.' match '\n'.  Without it an escaped newline matches
        # no rule, scan() stops there, and the rest of the input is lost.
        flags=re.DOTALL,
    )

    # scan() stops at the first position that no rule matches and returns
    # the unscanned remainder.  For single-character markers the only such
    # position is a dangling escape at the end, which is intentionally
    # ignored.
    scanner.scan(string)

    return tokens
|
|
|
|
|
|
if __name__ == '__main__':
    # Demo run: tokenize the module's sample string with the default
    # escape and separator, then print the resulting list of tokens.
    demo_tokens = list(tokenize())
    print(demo_tokens)
|