diff --git a/grammar.ebnf b/grammar.ebnf index 60e3641..2f39fd3 100644 --- a/grammar.ebnf +++ b/grammar.ebnf @@ -4,16 +4,18 @@ token ::= insert | meta-command | int | string + | semicolon | end-of-file /* command is second stage of parsing */ -command ::= cmd-insert - | cmd-select +command ::= cmd-insert semicolon + | cmd-select semicolon cmd-insert ::= insert int string string cmd-select ::= select insert ::= "insert" select ::= "select" +semicolon ::= ";" meta-command ::= "." "exit" | "about" diff --git a/notes.org b/notes.org index 223e8e9..8a40aac 100644 --- a/notes.org +++ b/notes.org @@ -231,14 +231,27 @@ i will use rustyline, since it seems like the most feature-complete * TODO .license meta-command * TODO .help meta-command -* TODO parse insert statements in the form +* DONE parse insert statements in the form insert -** TODO Row struct -** TODO parse row insert -** TODO separate statements with semicolons -** TODO in case of parse error, skip until next semicolon to better recover -** TODO serialize/deserialize row to/from raw bytes -*** TODO look for best practices for creating binary formats +** DONE parse row insert +* DONE separate statements with semicolons +* TODO this error message could be better +#+begin example +Error: unexpected token + ╭─[ :1:24 ] + │ + 1 │ insert 0 "user" "email" + │ │ + │ ╰─ found end of file "" + │ + │ Note: expected token type to be one of ["semicolon"] +───╯ +#+end example +* TODO correct all instances of in locations +* TODO meta-commands must be followed by end-of-file +* DONE in case of parse error, skip until next semicolon to better recover +* TODO serialize/deserialize row to/from raw bytes +** TODO look for best practices for creating binary formats * WAIT cli tests using insta-cmd https://insta.rs/docs/cmd/ diff --git a/src/meta_commands.rs b/src/meta_commands.rs index 135e1ab..fb5d950 100644 --- a/src/meta_commands.rs +++ b/src/meta_commands.rs @@ -1,6 +1,6 @@ use crate::branding; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, Clone)] pub enum MetaCommand { Exit, About, diff --git a/src/parser.rs b/src/parser.rs index bd6b087..585f5f8 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -6,40 +6,123 @@ use crate::{ tokens::{Location, Token, TokenData, tokenize}, }; +// Helper function to skip tokens until reaching a semicolon or end of file +// This helps with error recovery when a statement has a syntax error +fn skip_to_next_statement(tokens: &mut VecDeque) { + while let Some(token) = tokens.front() { + match token.data { + TokenData::Semicolon | TokenData::EndOfFile => break, + _ => { + tokens.pop_front(); + } + } + } + + // Consume the semicolon if that's what we stopped at + if tokens + .front() + .is_some_and(|t| matches!(t.data, TokenData::Semicolon)) + { + tokens.pop_front(); + } +} + +// Helper function to check for a semicolon after a statement +fn expect_semicolon(tokens: &mut VecDeque) -> Result<(), CommandParseError> { + if let Some(next_token) = tokens.front() { + match next_token.data { + TokenData::Semicolon => { + tokens.pop_front(); // Consume the semicolon + Ok(()) + } + _ => Err(CommandParseError::UnexpectedToken( + next_token.clone(), + &["semicolon"], + )), + } + } else { + // Even at the end of input, we need a semicolon + Err(CommandParseError::UnexpectedToken( + Token { + location: tokens.back().map_or_else( + || Location::new(String::from(""), 0, 0), + |t| t.location.clone(), + ), + data: TokenData::EndOfFile, + lexeme: String::new(), + }, + &["semicolon"], + )) + } +} + +fn parse_select_command( + tokens: &mut VecDeque, +) -> Result { + // Parse the select command (currently doesn't require additional tokens) + let cmd = Command::Statement(Statement::Select); + + // Check for semicolon after select command + expect_semicolon(tokens)?; + + Ok(cmd) +} + pub fn parse(file: String, input: String) -> Result, Vec> { let mut tokens: VecDeque<_> = tokenize(input, file) .map_err(|x| x.into_iter().map(|x| x.into()).collect::>())? .into(); let mut cmds = Vec::new(); let mut errs = Vec::new(); + while let Some(token) = tokens.pop_front() { match token.data { - crate::tokens::TokenData::Insert => match parse_insert_command(&mut tokens) { + TokenData::Insert => match parse_insert_command(&mut tokens) { Ok(cmd) => cmds.push(cmd), - Err(err) => errs.push(err), + Err(err) => { + errs.push(err); + skip_to_next_statement(&mut tokens); // Skip to next statement for error recovery + } }, - crate::tokens::TokenData::Select => cmds.push(Command::Statement(Statement::Select)), - crate::tokens::TokenData::MetaCommand(meta_command) => { - cmds.push(Command::MetaCommand(meta_command)) + TokenData::Select => match parse_select_command(&mut tokens) { + Ok(cmd) => cmds.push(cmd), + Err(err) => { + errs.push(err); + skip_to_next_statement(&mut tokens); // Skip to next statement for error recovery + } + }, + TokenData::MetaCommand(meta_command) => { + // Meta commands don't require semicolons per grammar + cmds.push(Command::MetaCommand(meta_command)); } - crate::tokens::TokenData::Int(_) => errs.push(CommandParseError::UnexpectedToken( - token, - &["statement", "meta command", "eof"], - )), - crate::tokens::TokenData::String(_) => errs.push(CommandParseError::UnexpectedToken( - token, - &["statement", "meta command", "eof"], - )), - crate::tokens::TokenData::EndOfFile => (), + TokenData::Semicolon => { + // Empty statement (just a semicolon) - ignore it + } + TokenData::Int(_) => { + errs.push(CommandParseError::UnexpectedToken( + token, + &["statement", "meta command", "eof"], + )); + skip_to_next_statement(&mut tokens); + } + TokenData::String(_) => { + errs.push(CommandParseError::UnexpectedToken( + token, + &["statement", "meta command", "eof"], + )); + skip_to_next_statement(&mut tokens); + } + TokenData::EndOfFile => (), // End of parsing } } + if errs.is_empty() { Ok(cmds) } else { Err(errs) } } fn parse_insert_command( tokens: &mut VecDeque, ) -> Result { - // According to grammar.ebnf, insert command should be: insert int string string + // According to grammar.ebnf, insert command should be: insert int string string semicolon // Parse the id (integer) let id_token = tokens.pop_front().ok_or_else(|| { @@ -106,6 +189,9 @@ fn parse_insert_command( _ => return Err(CommandParseError::UnexpectedToken(email_token, &["string"])), }; + // Check for semicolon after the insert command + expect_semicolon(tokens)?; + Ok(Command::Statement(Statement::Insert { id, username, @@ -122,8 +208,8 @@ mod tests { fn test_parse_single_correct() { let file = String::from(""); assert_debug_snapshot!(parse(file.clone(), String::from(".exit"))); - assert_debug_snapshot!(parse(file.clone(), String::from("select"))); - assert_debug_snapshot!(parse(file.clone(), String::from("sElEcT"))); + assert_debug_snapshot!(parse(file.clone(), String::from("select;"))); + assert_debug_snapshot!(parse(file.clone(), String::from("sElEcT;"))); } #[test] @@ -131,13 +217,23 @@ mod tests { let file = String::from(""); assert_debug_snapshot!(parse( file.clone(), - String::from(r#"insert 1 "username" "email@example.com""#) + String::from(r#"insert 1 "username" "email@example.com";"#) )); assert_debug_snapshot!(parse( file.clone(), - String::from(r#"insert "not_an_id" "username" "email@example.com""#) + String::from(r#"insert "not_an_id" "username" "email@example.com";"#) + )); + assert_debug_snapshot!(parse(file.clone(), String::from(r#"insert 1 "username";"#))); + } + + #[test] + fn test_parse_missing_semicolon() { + let file = String::from(""); + assert_debug_snapshot!(parse(file.clone(), String::from("select"))); + assert_debug_snapshot!(parse( + file.clone(), + String::from(r#"insert 1 "username" "email@example.com""#) )); - assert_debug_snapshot!(parse(file.clone(), String::from(r#"insert 1 "username""#))); } #[test] @@ -155,7 +251,16 @@ mod tests { let file = String::from(""); assert_debug_snapshot!(parse( file.clone(), - String::from(".exit select select select") + String::from(".exit select; select; select;") + )); + } + + #[test] + fn test_parse_multiple_statements_with_insert() { + let file = String::from(""); + assert_debug_snapshot!(parse( + file.clone(), + String::from(r#"select; insert 1 "user" "email@test.com"; select;"#) )); } diff --git a/src/snapshots/osdb__parser__tests__parse_insert_command-2.snap b/src/snapshots/osdb__parser__tests__parse_insert_command-2.snap index 8582c85..72d9ec6 100644 --- a/src/snapshots/osdb__parser__tests__parse_insert_command-2.snap +++ b/src/snapshots/osdb__parser__tests__parse_insert_command-2.snap @@ -1,6 +1,6 @@ --- source: src/parser.rs -expression: "parse(file.clone(),\nString::from(r#\"insert \"not_an_id\" \"username\" \"email@example.com\"\"#))" +expression: "parse(file.clone(),\nString::from(r#\"insert \"not_an_id\" \"username\" \"email@example.com\";\"#))" --- Err( [ @@ -20,41 +20,5 @@ Err( "integer", ], ), - UnexpectedToken( - Token { - location: Location { - file: "", - offset: 19, - length: 10, - }, - data: String( - "username", - ), - lexeme: "\"username\"", - }, - [ - "statement", - "meta command", - "eof", - ], - ), - UnexpectedToken( - Token { - location: Location { - file: "", - offset: 30, - length: 19, - }, - data: String( - "email@example.com", - ), - lexeme: "\"email@example.com\"", - }, - [ - "statement", - "meta command", - "eof", - ], - ), ], ) diff --git a/src/snapshots/osdb__parser__tests__parse_insert_command-3.snap b/src/snapshots/osdb__parser__tests__parse_insert_command-3.snap index dc13e7a..a4fbebd 100644 --- a/src/snapshots/osdb__parser__tests__parse_insert_command-3.snap +++ b/src/snapshots/osdb__parser__tests__parse_insert_command-3.snap @@ -1,6 +1,6 @@ --- source: src/parser.rs -expression: "parse(file.clone(), String::from(r#\"insert 1 \"username\"\"#))" +expression: "parse(file.clone(), String::from(r#\"insert 1 \"username\";\"#))" --- Err( [ @@ -9,10 +9,10 @@ Err( location: Location { file: "", offset: 19, - length: 0, + length: 1, }, - data: EndOfFile, - lexeme: "", + data: Semicolon, + lexeme: ";", }, [ "string", diff --git a/src/snapshots/osdb__parser__tests__parse_missing_semicolon-2.snap b/src/snapshots/osdb__parser__tests__parse_missing_semicolon-2.snap new file mode 100644 index 0000000..a6787ca --- /dev/null +++ b/src/snapshots/osdb__parser__tests__parse_missing_semicolon-2.snap @@ -0,0 +1,22 @@ +--- +source: src/parser.rs +expression: "parse(file.clone(),\nString::from(r#\"insert 1 \"username\" \"email@example.com\"\"#))" +--- +Err( + [ + UnexpectedToken( + Token { + location: Location { + file: "", + offset: 39, + length: 0, + }, + data: EndOfFile, + lexeme: "", + }, + [ + "semicolon", + ], + ), + ], +) diff --git a/src/snapshots/osdb__parser__tests__parse_missing_semicolon.snap b/src/snapshots/osdb__parser__tests__parse_missing_semicolon.snap new file mode 100644 index 0000000..ad0856f --- /dev/null +++ b/src/snapshots/osdb__parser__tests__parse_missing_semicolon.snap @@ -0,0 +1,22 @@ +--- +source: src/parser.rs +expression: "parse(file.clone(), String::from(\"select\"))" +--- +Err( + [ + UnexpectedToken( + Token { + location: Location { + file: "", + offset: 6, + length: 0, + }, + data: EndOfFile, + lexeme: "", + }, + [ + "semicolon", + ], + ), + ], +) diff --git a/src/snapshots/osdb__parser__tests__parse_multiple_statements_with_insert.snap b/src/snapshots/osdb__parser__tests__parse_multiple_statements_with_insert.snap new file mode 100644 index 0000000..0305714 --- /dev/null +++ b/src/snapshots/osdb__parser__tests__parse_multiple_statements_with_insert.snap @@ -0,0 +1,21 @@ +--- +source: src/parser.rs +expression: "parse(file.clone(),\nString::from(r#\"select; insert 1 \"user\" \"email@test.com\"; select;\"#))" +--- +Ok( + [ + Statement( + Select, + ), + Statement( + Insert { + id: 1, + username: "user", + email: "email@test.com", + }, + ), + Statement( + Select, + ), + ], +) diff --git a/src/statements.rs b/src/statements.rs index 3799caf..cf5b3fa 100644 --- a/src/statements.rs +++ b/src/statements.rs @@ -20,7 +20,7 @@ impl Statement { username, email, } => StatementExecuteResult { - msg: String::from(format!("insert {id:?} {username:?} {email:?}")), + msg: format!("insert {id:?} {username:?} {email:?}"), }, Statement::Select => StatementExecuteResult { msg: String::from("select"), diff --git a/src/tokens.rs b/src/tokens.rs index 14316a2..ed2f23b 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -1,6 +1,6 @@ use crate::meta_commands::MetaCommand; -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, Clone)] pub enum TokenData { Insert, Select, @@ -8,6 +8,7 @@ pub enum TokenData { EndOfFile, Int(i64), String(String), + Semicolon, } #[derive(Debug, Eq, PartialEq, Clone)] @@ -46,7 +47,7 @@ impl Location { } } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, Clone)] pub struct Token { /// Where in the input was this token found? pub location: Location, @@ -65,6 +66,7 @@ impl std::fmt::Display for Token { TokenData::EndOfFile => write!(f, "end of file"), TokenData::Int(x) => write!(f, "integer {x}"), TokenData::String(x) => write!(f, "string {x:?}"), + TokenData::Semicolon => write!(f, "semicolon"), }?; let lexeme = &self.lexeme; write!(f, " {lexeme:?}") @@ -307,6 +309,15 @@ impl Tokenizer { } } + fn scan_semicolon(&mut self) -> Result { + self.advance(); + Ok(Token { + location: self.previous_location(1), + data: TokenData::Semicolon, + lexeme: String::from(";"), + }) + } + fn scan_token(&mut self) -> Result, ScanError> { loop { if let Some(c) = self.peek() { @@ -318,6 +329,8 @@ impl Tokenizer { return self.scan_integer().map(Some); } else if c == '"' { return self.scan_string().map(Some); + } else if c == ';' { + return self.scan_semicolon().map(Some); } else if c.is_whitespace() { self.advance(); } else {