RosettaCodeData/Task/Tokenize-a-string-with-esca.../SNOBOL4/tokenize-a-string-with-esca...


* Program: tokenize_with_escape.sbl
* To run: sbl tokenize_with_escape.sbl
* Description: Tokenize a string with escaping
* Comment: Tested using the Spitbol for Linux version of SNOBOL4
        lf = substr(&alphabet,11,1)     ;* New line or line feed
* Function tokenize breaks a string s into an array of parts separated
* by the character c, which defaults to a comma. Parameter kp=1 (the
* default) keeps null parts; kp=0 discards them.
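* An illustrative call sketch (example values, not part of the task):
*   parts = tokenize("a,b,,c")           default sep "," and kp=1 -> 4 parts
*   parts = tokenize("a,b,,c", ",", 0)   kp=0 drops the null part -> 3 parts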
        define('tokenize(s,c,kp)tokenizepat,part,t,i,j')
        :(tokenize_end)
tokenize
        c = (ident(c) ',', substr(c,1,1))       :f(freturn)
        kp = (ident(kp) 1, eq(kp,0) 0, 1)       :f(freturn)
        t = table()
        tokenizepat = breakx(c) . part c | (len(1) rem) . part
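* When null parts are kept and s already ends with the separator, append
* one more separator so the trailing null field is not lost by the loop.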
        s ? eq(kp,1) rtab(1) c = s c
tokenize1
        s ? tokenizepat = ""    :f(tokenize2)
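* At most one of the next two statements stores the part: the first when
* kp=0 and the part is non-null, the second whenever kp=1.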
        t[i = eq(kp,0) differ(part) i + 1] = part
        t[i = eq(kp,1) i + 1] = part
        :(tokenize1)
tokenize2
        tokenize = array(i)     :f(errr)
        j = 0
tokenize3       tokenize[j = lt(j,i) j + 1] = t[j]      :s(tokenize3)
        :(return)
tokenize_end

* Function tokcan normalizes string ts by applying separator and escape
* rules. Parameter sep is the separator and esc is the escape character.
* Parameter tesc is the new separator character substituted for sep; it
* defaults to a comma, ",".
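* For example (illustrative values, not part of the task): with sep "|" and
* esc "^", tokcan("a^|b|c|") yields "a|b,c,". The "^|" becomes a literal "|",
* each unescaped "|" becomes the default tesc ",", and "^^" would become "^".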
        define('tokcan(ts,sep,esc,tesc)tpat,part1,part2,notany')       :(tokcan_end)
tokcan
        tesc = (ident(tesc) ',', substr(tesc,1,1))
        tpat = (breakx(sep esc) . part1
+               (sep | esc sep | esc esc | (esc len(1) . notany)) . part2
+               )
+               | (len(1) rem) . part1
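* tpat captures the text up to the next separator or escape in part1 and
* the separator/escape sequence itself in part2; the final alternative
* sweeps up whatever remains once no further such sequence can be matched.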
tokcan1
        ts ? tpat =     :f(tokcan2)
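* Translate the matched sequence: a bare separator becomes tesc, an escaped
* separator becomes the separator itself, an escaped escape becomes the
* escape character, and an escape before any other character yields that
* character alone.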
        part2 = (leq(part2,sep) tesc
+               ,leq(part2,esc sep) sep
+               ,leq(part2,esc esc) esc
+               ,differ(notany) leq(part2,esc notany) notany
+               )
        tokcan = (ident(tokcan) "", tokcan) part1 part2
        :(tokcan1)
tokcan2
        :(return)
tokcan_end

        test_string = "one^|uno||three^^^^|four^^^|^cuatro|"
        sep = "|"
        esc = "^"
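* In test_string, "^|" escapes a separator, "^^" escapes the escape
* character, and the trailing "|" produces a final null field.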
        hline = tokcan(test_string,sep,esc)     :f(err)
        output = "  Input: " test_string lf
        output = "Output1: " hline lf
        output = "Output2: "
        tokenized = tokenize(hline,",")
p1      output = "'" tokenized[z = z + 1] "'"   :s(p1)
END