34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
import re
|
|
|
|
# Sample input; used as the default `string` argument of tokenize().
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'


def tokenize(string: str = STRING, escape: str = '^', separator: str = '|'):
    """Split *string* on unescaped separators and resolve escape sequences.

    An occurrence of *escape* makes the character that follows it literal
    (including *separator*, *escape* itself, and newlines); the escape
    character is removed from the resulting token.  Empty tokens are kept,
    so the empty string produces a single empty token.

    Args:
        string: Text to split.
        escape: Escape character.  Must be a single character, because it
            is placed inside a regex character class.
        separator: Separator character.  Must be a single character that
            differs from *escape*, for the same reason.

    Yields:
        The tokens of *string*, in order, with escape characters resolved.
        A lone escape character at the very end of *string* has nothing to
        escape and is dropped.

    Example:
        >>> list(tokenize('one^|uno||three^^^^|four^^^|^cuatro|'))
        ['one|uno', '', 'three^^', 'four^|cuatro', '']
    """
    re_escape, re_separator = map(re.escape, (escape, separator))

    # A token must be preceded by a separator (Python lookbehinds cannot
    # express "start of input OR separator", hence the separator that is
    # prepended to the input below).  The token itself is any run, possibly
    # empty, of escape sequences or plain (non-escape, non-separator)
    # characters.  DOTALL lets an escape sequence cover a newline as well.
    token_re = re.compile(
        fr'(?<={re_separator})(?:{re_escape}.|[^{re_escape}{re_separator}])*',
        flags=re.DOTALL,
    )

    # Strips the escape character from each escape sequence: '^^' -> '^'.
    # Compiled once here rather than on every loop iteration.
    unescape_re = re.compile(fr'{re_escape}(.)', flags=re.DOTALL)

    # Every token must follow a separator, so give the first token one too.
    for match in token_re.finditer(separator + string):
        yield unescape_re.sub(r'\1', match.group())
|
|
|
|
if __name__ == '__main__':
    # Demo run: tokenize the built-in sample string and print the token list.
    tokens = list(tokenize())
    print(tokens)
|