RosettaCodeData/Task/S-expressions/Rust/s-expressions.rust

//! This implementation isn't based on anything in particular, although it's probably informed by a
//! lot of Rust's JSON encoding code.  It should be very fast (both encoding and decoding the toy
//! example here takes under a microsecond on my machine) and tries to avoid unnecessary allocation.
//!
//! In a real implementation, most of this would be private, with only a few visible functions, and
//! there would be somewhat nicer signatures (in particular, the fact that `ParseContext` has to be
//! mutable would get annoying in real code pretty quickly, so it would probably be split out).
//!
//! It supports the ability to read individual atoms, not just lists, although whether this is
//! useful is questionable.
//!
//! Caveats: Does not support symbols vs. non-symbols (it wouldn't be hard, but it would greatly
//! complicate setting up our test structure since we'd have to force it to go through functions
//! that checked to make sure `Symbol`s couldn't have spaces, or slow down our parser by checking
//! for this information each time, which is obnoxious).  Does not support string escaping, because
//! the decoding technique doesn't allocate extra space for strings.  Does support numbers, but
//! only float types (supporting more types is possible but would complicate the code
//! significantly).

extern crate typed_arena;

use typed_arena::Arena;

use self::Error::*;
use self::SExp::*;
use self::Token::*;
use std::io;
use std::num::FpCategory;
use std::str::FromStr;

/// The actual `SExp` structure.  Supports `f64`s, lists, and string literals.  Note that it takes
/// everything by reference, rather than owning it--this is mostly done just so we can allocate
/// `SExp`s statically (since we don't have to call `Vec`).  It does complicate the code a bit,
/// requiring us to have a `ParseContext` that holds an arena where lists are actually allocated.
#[derive(PartialEq, Debug)]
pub enum SExp<'a> {
    /// Float literal: 0.5
    F64(f64),

    /// List of SExps: ( a b c)
    List(&'a [SExp<'a>]),

    /// Plain old string literal: "abc"
    Str(&'a str),
}

/// Errors that can be thrown by the parser.
#[derive(PartialEq, Debug)]
pub enum Error {
    /// If the float is `NaN`, `Infinity`, etc.
    NoReprForFloat,

    /// Missing an end double quote during string parsing
    UnterminatedStringLiteral,

    /// Some other kind of I/O error
    Io,

    /// ) appeared where it shouldn't (usually as the first token)
    IncorrectCloseDelimiter,

    /// Usually means a missing ), but could also mean there were no tokens at all.
    UnexpectedEOF,

    /// More tokens after the list is finished, or after a literal if there is no list.
    ExpectedEOF,
}

impl From<io::Error> for Error {
    fn from(_err: io::Error) -> Error {
        Error::Io
    }
}

/// Tokens returned from the token stream.
#[derive(PartialEq, Debug)]
enum Token<'a> {
    /// Left parenthesis
    ListStart,

    /// Right parenthesis
    ListEnd,

    /// String or float literal, quotes removed.
    Literal(SExp<'a>),

    /// Stream is out of tokens.
    Eof,
}

/// An iterator over a string that yields a stream of Tokens.
///
/// Implementation note: it probably seems weird to store first, rest, AND string, since they should
/// all be derivable from string.  But see below.
#[derive(Copy, Clone, Debug)]
struct Tokens<'a> {
    /// The part of the string that still needs to be parsed
    string: &'a str,

    /// The first character to parse
    first: Option<char>,

    /// The rest of the string after the first character
    rest: &'a str,
}

impl<'a> Tokens<'a> {
    /// Initialize a token stream for a given string.
    fn new(string: &str) -> Tokens {
        let mut chars = string.chars();

        match chars.next() {
            Some(ch) => Tokens {
                string,
                first: Some(ch),
                rest: chars.as_str(),
            },
            None => Tokens {
                string,
                first: None,
                rest: string,
            },
        }
    }

    /// Utility function to update information in the iterator.  It might not be performant to keep
    /// rest cached, but there are times where we don't know exactly what string is (at least, not
    /// in a way that we can *safely* reconstruct it without allocating), so we keep both here.
    /// With some unsafe code we could probably get rid of one of them (and maybe first, too).
    fn update(&mut self, string: &'a str) {
        self.string = string;
        let mut chars = self.string.chars();

        if let Some(ch) = chars.next() {
            self.first = Some(ch);
            self.rest = chars.as_str();
        } else {
            self.first = None;
        };
    }

    /// This is where the lexing happens.  Note that it does not handle string escaping.
    fn next_token(&mut self) -> Result<Token<'a>, Error> {
        loop {
            match self.first {
                // List start
                Some('(') => {
                    self.update(self.rest);
                    return Ok(ListStart);
                }
                // List end
                Some(')') => {
                    self.update(self.rest);
                    return Ok(ListEnd);
                }
                // Quoted literal start
                Some('"') => {
                    // Split the string at most once.  This lets us get a
                    // reference to the next piece of the string without having
                    // to loop through the string again.
                    let mut iter = self.rest.splitn(2, '"');
                    // The first time splitn is run it will never return None, so this is safe.
                    let str = iter.next().unwrap();
                    match iter.next() {
                        // Extract the interior of the string without allocating.  If we want to
                        // handle string escaping, we would have to allocate at some point though.
                        Some(s) => {
                            self.update(s);
                            return Ok(Literal(Str(str)));
                        }
                        None => return Err(UnterminatedStringLiteral),
                    }
                }
                // Plain old literal start
                Some(c) => {
                    // Skip whitespace.  This could probably be made more efficient.
                    if c.is_whitespace() {
                        self.update(self.rest);
                        continue;
                    }
                    // Since we've exhausted all other possibilities, this must be a real literal.
                    // Unlike the quoted case, it's not an error to encounter EOF before whitespace.
                    let mut end_ch = None;
                    let str = {
                        let mut iter = self.string.splitn(2, |ch: char| {
                            let term = ch == ')' || ch == '(';
                            if term {
                                end_ch = Some(ch)
                            }
                            term || ch.is_whitespace()
                        });
                        // The first time splitn is run it will never return None, so this is safe.
                        let str = iter.next().unwrap();
                        self.rest = iter.next().unwrap_or("");
                        str
                    };
                    match end_ch {
                        // self.string will be incorrect in the Some(_) case.  The only reason it's
                        // okay is because the next time next() is called in this case, we know it
                        // will be '(' or ')', so it will never reach any code that actually looks
                        // at self.string.  In a real implementation this would be enforced by
                        // visibility rules.
                        Some(_) => self.first = end_ch,
                        None => self.update(self.rest),
                    }
                    return Ok(Literal(parse_literal(str)));
                }
                None => return Ok(Eof),
            }
        }
    }
}

/// This is not the most efficient way to do this, because we end up going over numeric literals
/// twice, but it avoids having to write our own number parsing logic.
fn parse_literal(literal: &str) -> SExp {
    match literal.bytes().next() {
        Some(b'0'..=b'9') | Some(b'-') => match f64::from_str(literal) {
            Ok(f) => F64(f),
            Err(_) => Str(literal),
        },
        _ => Str(literal),
    }
}

/// Parse context, holds information required by the parser (and owns any allocations it makes)
pub struct ParseContext<'a> {
    /// The string being parsed.  Not required, but convenient.
    string: &'a str,

    /// Arena holding any allocations made by the parser.
    arena: Option<Arena<Vec<SExp<'a>>>>,

    /// Stored in the parse context so it can be reused once allocated.
    stack: Vec<Vec<SExp<'a>>>,
}

impl<'a> ParseContext<'a> {
    /// Create a new parse context from a given string
    pub fn new(string: &'a str) -> ParseContext<'a> {
        ParseContext {
            string,
            arena: None,
            stack: Vec::new(),
        }
    }
}

impl<'a> SExp<'a> {
    /// Serialize a SExp.
    fn encode<T: io::Write>(&self, writer: &mut T) -> Result<(), Error> {
        match *self {
            F64(f) => {
                match f.classify() {
                    // We don't want to identify NaN, Infinity, etc. as floats.
                    FpCategory::Normal | FpCategory::Zero => {
                        write!(writer, "{}", f)?;
                        Ok(())
                    }
                    _ => Err(Error::NoReprForFloat),
                }
            }
            List(l) => {
                // Writing a list is very straightforward--write a left parenthesis, then
                // recursively call encode on each member, and then write a right parenthesis.  The
                // only reason the logic is as long as it is is to make sure we don't write
                // unnecessary spaces between parentheses in the zero or one element cases.
                write!(writer, "(")?;
                let mut iter = l.iter();
                if let Some(sexp) = iter.next() {
                    sexp.encode(writer)?;
                    for sexp in iter {
                        write!(writer, " ")?;
                        sexp.encode(writer)?;
                    }
                }
                write!(writer, ")")?;
                Ok(())
            }
            Str(s) => {
                write!(writer, "\"{}\"", s)?;
                Ok(())
            }
        }
    }

    /// Deserialize a SExp.
    pub fn parse(ctx: &'a mut ParseContext<'a>) -> Result<SExp<'a>, Error> {
        ctx.arena = Some(Arena::new());
        // Hopefully this unreachable! gets optimized out, because it should literally be
        // unreachable.
        let arena = match ctx.arena {
            Some(ref mut arena) => arena,
            None => unreachable!(),
        };
        let ParseContext {
            string,
            ref mut stack,
            ..
        } = *ctx;
        // Make sure the stack is cleared--we keep it in the context to avoid unnecessary
        // reallocation between parses (if you need to remember old parse information for a new
        // list, you can pass in a new context).
        stack.clear();
        let mut tokens = Tokens::new(string);
        // First, we check the very first token to see if we're parsing a full list.  It
        // simplifies parsing a lot in the subsequent code if we can assume that.
        let next = tokens.next_token();
        let mut list = match next? {
            ListStart => Vec::new(),
            Literal(s) => {
                return if tokens.next_token()? == Eof {
                    Ok(s)
                } else {
                    Err(ExpectedEOF)
                };
            }
            ListEnd => return Err(IncorrectCloseDelimiter),
            Eof => return Err(UnexpectedEOF),
        };

        // We know we're in a list if we got this far.
        loop {
            let tok = tokens.next_token();
            match tok? {
                ListStart => {
                    // We push the previous context onto our stack when we start reading a new list.
                    stack.push(list);
                    list = Vec::new()
                }
                Literal(s) => list.push(s), // Plain old literal, push it onto the current list
                ListEnd => {
                    match stack.pop() {
                        // Pop the old context off the stack on list end.
                        Some(mut l) => {
                            // We allocate a slot for the current list in our parse context (needed
                            // for safety) before pushing it onto its parent list.
                            l.push(List(&*arena.alloc(list)));
                            // Now reset the current list to the parent list
                            list = l;
                        }
                        // There was nothing on the stack, so we're at the end of the topmost list.
                        // The check to make sure there are no more tokens is required for
                        // correctness.
                        None => {
                            return match tokens.next_token()? {
                                Eof => Ok(List(&*arena.alloc(list))),
                                _ => Err(ExpectedEOF),
                            };
                        }
                    }
                }
                // We encountered an EOF before the list ended--that's an error.
                Eof => return Err(UnexpectedEOF),
            }
        }
    }

    /// Convenience method for the common case where you just want to encode a SExp as a String.
    pub fn buffer_encode(&self) -> Result<String, Error> {
        let mut m = Vec::new();
        self.encode(&mut m)?;
        // Because encode() only ever writes valid UTF-8, we can safely skip the secondary check we
        // normally have to do when converting from Vec<u8> to String.  If we didn't know that the
        // buffer was already UTF-8, we'd want to call container_as_str() here.
        unsafe { Ok(String::from_utf8_unchecked(m)) }
    }
}

pub const SEXP_STRUCT: SExp<'static> = List(&[
    List(&[Str("data"), Str("quoted data"), F64(123.), F64(4.5)]),
    List(&[
        Str("data"),
        List(&[Str("!@#"), List(&[F64(4.5)]), Str("(more"), Str("data)")]),
    ]),
]);

pub const SEXP_STRING_IN: &str = r#"((data "quoted data" 123 4.5)
(data (!@# (4.5) "(more" "data)")))"#;


and main.rs:

use s_expressions::{ParseContext, SExp, SEXP_STRING_IN, SEXP_STRUCT};

fn main() {
    println!("{:?}", SEXP_STRUCT.buffer_encode());
    let ctx = &mut ParseContext::new(SEXP_STRING_IN);
    println!("{:?}", SExp::parse(ctx));
}