RosettaCodeData/Task/S-expressions/Rust/s-expressions.rust

391 lines
15 KiB
Plaintext

//! This implementation isn't based on anything in particular, although it's probably informed by a
//! lot of Rust's JSON encoding code. It should be very fast (both encoding and decoding the toy
//! example here takes under a microsecond on my machine) and tries to avoid unnecessary allocation.
//!
//! In a real implementation, most of this would be private, with only a few visible functions, and
//! there would be somewhat nicer signatures (in particular, the fact that `ParseContext` has to be
//! mutable would get annoying in real code pretty quickly, so it would probably be split out).
//!
//! It supports the ability to read individual atoms, not just lists, although whether this is
//! useful is questionable.
//!
//! Caveats: Does not support symbols vs. non-symbols (it wouldn't be hard, but it would greatly
//! complicate setting up our test structure since we'd have to force it to go through functions
//! that checked to make sure `Symbol`s couldn't have spaces, or slow down our parser by checking
//! for this information each time, which is obnoxious). Does not support string escaping, because
//! the decoding technique doesn't allocate extra space for strings. Does support numbers, but
//! only float types (supporting more types is possible but would complicate the code
//! significantly).
extern crate typed_arena;
use typed_arena::Arena;
use self::Error::*;
use self::SExp::*;
use self::Token::*;
use std::io;
use std::num::FpCategory;
use std::str::FromStr;
/// The actual `SExp` structure. Supports `f64`s, lists, and string literals. Note that it takes
/// everything by reference, rather than owning it--this is mostly done just so we can allocate
/// `SExp`s statically (since we don't have to call `Vec`). It does complicate the code a bit,
/// requiring us to have a `ParseContext` that holds an arena where lists are actually allocated.
#[derive(PartialEq, Debug)]
pub enum SExp<'a> {
/// Float literal: 0.5
F64(f64),
/// List of SExps: ( a b c)
List(&'a [SExp<'a>]),
/// Plain old string literal: "abc"
Str(&'a str),
}
/// Errors that can be thrown by the parser.
#[derive(PartialEq, Debug)]
pub enum Error {
/// If the float is `NaN`, `Infinity`, etc.
NoReprForFloat,
/// Missing an end double quote during string parsing
UnterminatedStringLiteral,
/// Some other kind of I/O error
Io,
/// ) appeared where it shouldn't (usually as the first token)
IncorrectCloseDelimiter,
/// Usually means a missing ), but could also mean there were no tokens at all.
UnexpectedEOF,
/// More tokens after the list is finished, or after a literal if there is no list.
ExpectedEOF,
}
impl From<io::Error> for Error {
fn from(_err: io::Error) -> Error {
Error::Io
}
}
/// Tokens returned from the token stream.
#[derive(PartialEq, Debug)]
enum Token<'a> {
/// Left parenthesis
ListStart,
/// Right parenthesis
ListEnd,
/// String or float literal, quotes removed.
Literal(SExp<'a>),
/// Stream is out of tokens.
Eof,
}
/// An iterator over a string that yields a stream of Tokens.
///
/// Implementation note: it probably seems weird to store first, rest, AND string, since they should
/// all be derivable from string. But see below.
#[derive(Copy, Clone, Debug)]
struct Tokens<'a> {
/// The part of the string that still needs to be parsed
string: &'a str,
/// The first character to parse
first: Option<char>,
/// The rest of the string after the first character
rest: &'a str,
}
impl<'a> Tokens<'a> {
/// Initialize a token stream for a given string.
fn new(string: &str) -> Tokens {
let mut chars = string.chars();
match chars.next() {
Some(ch) => Tokens {
string,
first: Some(ch),
rest: chars.as_str(),
},
None => Tokens {
string,
first: None,
rest: string,
},
}
}
/// Utility function to update information in the iterator. It might not be performant to keep
/// rest cached, but there are times where we don't know exactly what string is (at least, not
/// in a way that we can *safely* reconstruct it without allocating), so we keep both here.
/// With some unsafe code we could probably get rid of one of them (and maybe first, too).
fn update(&mut self, string: &'a str) {
self.string = string;
let mut chars = self.string.chars();
if let Some(ch) = chars.next() {
self.first = Some(ch);
self.rest = chars.as_str();
} else {
self.first = None;
};
}
/// This is where the lexing happens. Note that it does not handle string escaping.
fn next_token(&mut self) -> Result<Token<'a>, Error> {
loop {
match self.first {
// List start
Some('(') => {
self.update(self.rest);
return Ok(ListStart);
}
// List end
Some(')') => {
self.update(self.rest);
return Ok(ListEnd);
}
// Quoted literal start
Some('"') => {
// Split the string at most once. This lets us get a
// reference to the next piece of the string without having
// to loop through the string again.
let mut iter = self.rest.splitn(2, '"');
// The first time splitn is run it will never return None, so this is safe.
let str = iter.next().unwrap();
match iter.next() {
// Extract the interior of the string without allocating. If we want to
// handle string escaping, we would have to allocate at some point though.
Some(s) => {
self.update(s);
return Ok(Literal(Str(str)));
}
None => return Err(UnterminatedStringLiteral),
}
}
// Plain old literal start
Some(c) => {
// Skip whitespace. This could probably be made more efficient.
if c.is_whitespace() {
self.update(self.rest);
continue;
}
// Since we've exhausted all other possibilities, this must be a real literal.
// Unlike the quoted case, it's not an error to encounter EOF before whitespace.
let mut end_ch = None;
let str = {
let mut iter = self.string.splitn(2, |ch: char| {
let term = ch == ')' || ch == '(';
if term {
end_ch = Some(ch)
}
term || ch.is_whitespace()
});
// The first time splitn is run it will never return None, so this is safe.
let str = iter.next().unwrap();
self.rest = iter.next().unwrap_or("");
str
};
match end_ch {
// self.string will be incorrect in the Some(_) case. The only reason it's
// okay is because the next time next() is called in this case, we know it
// will be '(' or ')', so it will never reach any code that actually looks
// at self.string. In a real implementation this would be enforced by
// visibility rules.
Some(_) => self.first = end_ch,
None => self.update(self.rest),
}
return Ok(Literal(parse_literal(str)));
}
None => return Ok(Eof),
}
}
}
}
/// This is not the most efficient way to do this, because we end up going over numeric literals
/// twice, but it avoids having to write our own number parsing logic.
fn parse_literal(literal: &str) -> SExp {
match literal.bytes().next() {
Some(b'0'..=b'9') | Some(b'-') => match f64::from_str(literal) {
Ok(f) => F64(f),
Err(_) => Str(literal),
},
_ => Str(literal),
}
}
/// Parse context, holds information required by the parser (and owns any allocations it makes)
pub struct ParseContext<'a> {
/// The string being parsed. Not required, but convenient.
string: &'a str,
/// Arena holding any allocations made by the parser.
arena: Option<Arena<Vec<SExp<'a>>>>,
/// Stored in the parse context so it can be reused once allocated.
stack: Vec<Vec<SExp<'a>>>,
}
impl<'a> ParseContext<'a> {
/// Create a new parse context from a given string
pub fn new(string: &'a str) -> ParseContext<'a> {
ParseContext {
string,
arena: None,
stack: Vec::new(),
}
}
}
impl<'a> SExp<'a> {
/// Serialize a SExp.
fn encode<T: io::Write>(&self, writer: &mut T) -> Result<(), Error> {
match *self {
F64(f) => {
match f.classify() {
// We don't want to identify NaN, Infinity, etc. as floats.
FpCategory::Normal | FpCategory::Zero => {
write!(writer, "{}", f)?;
Ok(())
}
_ => Err(Error::NoReprForFloat),
}
}
List(l) => {
// Writing a list is very straightforward--write a left parenthesis, then
// recursively call encode on each member, and then write a right parenthesis. The
// only reason the logic is as long as it is is to make sure we don't write
// unnecessary spaces between parentheses in the zero or one element cases.
write!(writer, "(")?;
let mut iter = l.iter();
if let Some(sexp) = iter.next() {
sexp.encode(writer)?;
for sexp in iter {
write!(writer, " ")?;
sexp.encode(writer)?;
}
}
write!(writer, ")")?;
Ok(())
}
Str(s) => {
write!(writer, "\"{}\"", s)?;
Ok(())
}
}
}
/// Deserialize a SExp.
pub fn parse(ctx: &'a mut ParseContext<'a>) -> Result<SExp<'a>, Error> {
ctx.arena = Some(Arena::new());
// Hopefully this unreachable! gets optimized out, because it should literally be
// unreachable.
let arena = match ctx.arena {
Some(ref mut arena) => arena,
None => unreachable!(),
};
let ParseContext {
string,
ref mut stack,
..
} = *ctx;
// Make sure the stack is cleared--we keep it in the context to avoid unnecessary
// reallocation between parses (if you need to remember old parse information for a new
// list, you can pass in a new context).
stack.clear();
let mut tokens = Tokens::new(string);
// First, we check the very first token to see if we're parsing a full list. It
// simplifies parsing a lot in the subsequent code if we can assume that.
let next = tokens.next_token();
let mut list = match next? {
ListStart => Vec::new(),
Literal(s) => {
return if tokens.next_token()? == Eof {
Ok(s)
} else {
Err(ExpectedEOF)
};
}
ListEnd => return Err(IncorrectCloseDelimiter),
Eof => return Err(UnexpectedEOF),
};
// We know we're in a list if we got this far.
loop {
let tok = tokens.next_token();
match tok? {
ListStart => {
// We push the previous context onto our stack when we start reading a new list.
stack.push(list);
list = Vec::new()
}
Literal(s) => list.push(s), // Plain old literal, push it onto the current list
ListEnd => {
match stack.pop() {
// Pop the old context off the stack on list end.
Some(mut l) => {
// We allocate a slot for the current list in our parse context (needed
// for safety) before pushing it onto its parent list.
l.push(List(&*arena.alloc(list)));
// Now reset the current list to the parent list
list = l;
}
// There was nothing on the stack, so we're at the end of the topmost list.
// The check to make sure there are no more tokens is required for
// correctness.
None => {
return match tokens.next_token()? {
Eof => Ok(List(&*arena.alloc(list))),
_ => Err(ExpectedEOF),
};
}
}
}
// We encountered an EOF before the list ended--that's an error.
Eof => return Err(UnexpectedEOF),
}
}
}
/// Convenience method for the common case where you just want to encode a SExp as a String.
pub fn buffer_encode(&self) -> Result<String, Error> {
let mut m = Vec::new();
self.encode(&mut m)?;
// Because encode() only ever writes valid UTF-8, we can safely skip the secondary check we
// normally have to do when converting from Vec<u8> to String. If we didn't know that the
// buffer was already UTF-8, we'd want to call container_as_str() here.
unsafe { Ok(String::from_utf8_unchecked(m)) }
}
}
pub const SEXP_STRUCT: SExp<'static> = List(&[
List(&[Str("data"), Str("quoted data"), F64(123.), F64(4.5)]),
List(&[
Str("data"),
List(&[Str("!@#"), List(&[F64(4.5)]), Str("(more"), Str("data)")]),
]),
]);
pub const SEXP_STRING_IN: &str = r#"((data "quoted data" 123 4.5)
(data (!@# (4.5) "(more" "data)")))"#;
and main.rs:
use s_expressions::{ParseContext, SExp, SEXP_STRING_IN, SEXP_STRUCT};
fn main() {
println!("{:?}", SEXP_STRUCT.buffer_encode());
let ctx = &mut ParseContext::new(SEXP_STRING_IN);
println!("{:?}", SExp::parse(ctx));
}