feat(parser): implement semicolon-separated statements

Add support for semicolon-terminated statements according to the
updated grammar. This change enables executing multiple SQL statements
in a single input by separating them with semicolons. Key improvements
include:
- Update grammar to require semicolons after statements
- Add Semicolon token to the tokenizer
- Implement error recovery by skipping to next semicolon on parse errors
- Create helper functions for checking semicolons in statement parsers
- Add tests for multiple statements and error conditions
This commit is contained in:
Khaïs COLIN 2025-06-03 17:53:22 +02:00
parent 28cb288eaf
commit e78511f692
Signed by: logistic-bot
SSH key fingerprint: SHA256:RlpiqKeXpcPFZZ4y9Ou4xi2M8OhRJovIwDlbCaMsuAo
11 changed files with 237 additions and 75 deletions

View file

@ -4,16 +4,18 @@ token ::= insert
| meta-command | meta-command
| int | int
| string | string
| semicolon
| end-of-file | end-of-file
/* command is second stage of parsing */ /* command is second stage of parsing */
command ::= cmd-insert command ::= cmd-insert semicolon
| cmd-select | cmd-select semicolon
cmd-insert ::= insert int string string cmd-insert ::= insert int string string
cmd-select ::= select cmd-select ::= select
insert ::= "insert" insert ::= "insert"
select ::= "select" select ::= "select"
semicolon ::= ";"
meta-command ::= "." "exit" meta-command ::= "." "exit"
| "about" | "about"

View file

@ -231,14 +231,27 @@ i will use rustyline, since it seems like the most feature-complete
* TODO .license meta-command * TODO .license meta-command
* TODO .help meta-command * TODO .help meta-command
* TODO parse insert statements in the form * DONE parse insert statements in the form
insert <id:int> <username:string> <email:string> insert <id:int> <username:string> <email:string>
** TODO Row struct ** DONE parse row insert
** TODO parse row insert * DONE separate statements with semicolons
** TODO separate statements with semicolons * TODO this error message could be better
** TODO in case of parse error, skip until next semicolon to better recover #+begin_example
** TODO serialize/deserialize row to/from raw bytes Error: unexpected token
*** TODO look for best practices for creating binary formats ╭─[ <stdin>:1:24 ]
1 │ insert 0 "user" "email"
│ │
│ ╰─ found end of file ""
│ Note: expected token type to be one of ["semicolon"]
───╯
#+end_example
* TODO correct all instances of <unknown> in locations
* TODO meta-commands must be followed by end-of-file
* DONE in case of parse error, skip until next semicolon to better recover
* TODO serialize/deserialize row to/from raw bytes
** TODO look for best practices for creating binary formats
* WAIT cli tests using insta-cmd * WAIT cli tests using insta-cmd
https://insta.rs/docs/cmd/ https://insta.rs/docs/cmd/

View file

@ -1,6 +1,6 @@
use crate::branding; use crate::branding;
#[derive(Debug, Eq, PartialEq)] #[derive(Debug, Eq, PartialEq, Clone)]
pub enum MetaCommand { pub enum MetaCommand {
Exit, Exit,
About, About,

View file

@ -6,40 +6,123 @@ use crate::{
tokens::{Location, Token, TokenData, tokenize}, tokens::{Location, Token, TokenData, tokenize},
}; };
/// Discards tokens up to and including the next semicolon, so that parsing can
/// resume at the following statement after a syntax error.
///
/// An end-of-file token also stops the scan, but it is only peeked at and stays
/// in the queue. If the queue runs out before either token is seen, it is left
/// empty.
fn skip_to_next_statement(tokens: &mut VecDeque<Token>) {
    while let Some(upcoming) = tokens.front() {
        // End of file terminates recovery without being consumed.
        if matches!(upcoming.data, TokenData::EndOfFile) {
            return;
        }
        // Remember what we are about to drop: a semicolon is the last token
        // that recovery swallows.
        let at_semicolon = matches!(upcoming.data, TokenData::Semicolon);
        tokens.pop_front();
        if at_semicolon {
            return;
        }
    }
}
// Helper function to check for a semicolon after a statement
fn expect_semicolon(tokens: &mut VecDeque<crate::tokens::Token>) -> Result<(), CommandParseError> {
if let Some(next_token) = tokens.front() {
match next_token.data {
TokenData::Semicolon => {
tokens.pop_front(); // Consume the semicolon
Ok(())
}
_ => Err(CommandParseError::UnexpectedToken(
next_token.clone(),
&["semicolon"],
)),
}
} else {
// Even at the end of input, we need a semicolon
Err(CommandParseError::UnexpectedToken(
Token {
location: tokens.back().map_or_else(
|| Location::new(String::from("<unknown>"), 0, 0),
|t| t.location.clone(),
),
data: TokenData::EndOfFile,
lexeme: String::new(),
},
&["semicolon"],
))
}
}
/// Parses the remainder of a `select` statement.
///
/// `select` currently takes no arguments, so the only thing left to verify is
/// the terminating semicolon, which is consumed on success.
///
/// # Errors
///
/// Propagates the error from `expect_semicolon` when the statement is not
/// followed by a semicolon.
fn parse_select_command(
    tokens: &mut VecDeque<crate::tokens::Token>,
) -> Result<Command, CommandParseError> {
    expect_semicolon(tokens).map(|()| Command::Statement(Statement::Select))
}
pub fn parse(file: String, input: String) -> Result<Vec<Command>, Vec<CommandParseError>> { pub fn parse(file: String, input: String) -> Result<Vec<Command>, Vec<CommandParseError>> {
let mut tokens: VecDeque<_> = tokenize(input, file) let mut tokens: VecDeque<_> = tokenize(input, file)
.map_err(|x| x.into_iter().map(|x| x.into()).collect::<Vec<_>>())? .map_err(|x| x.into_iter().map(|x| x.into()).collect::<Vec<_>>())?
.into(); .into();
let mut cmds = Vec::new(); let mut cmds = Vec::new();
let mut errs = Vec::new(); let mut errs = Vec::new();
while let Some(token) = tokens.pop_front() { while let Some(token) = tokens.pop_front() {
match token.data { match token.data {
crate::tokens::TokenData::Insert => match parse_insert_command(&mut tokens) { TokenData::Insert => match parse_insert_command(&mut tokens) {
Ok(cmd) => cmds.push(cmd), Ok(cmd) => cmds.push(cmd),
Err(err) => errs.push(err), Err(err) => {
errs.push(err);
skip_to_next_statement(&mut tokens); // Skip to next statement for error recovery
}
}, },
crate::tokens::TokenData::Select => cmds.push(Command::Statement(Statement::Select)), TokenData::Select => match parse_select_command(&mut tokens) {
crate::tokens::TokenData::MetaCommand(meta_command) => { Ok(cmd) => cmds.push(cmd),
cmds.push(Command::MetaCommand(meta_command)) Err(err) => {
errs.push(err);
skip_to_next_statement(&mut tokens); // Skip to next statement for error recovery
}
},
TokenData::MetaCommand(meta_command) => {
// Meta commands don't require semicolons per grammar
cmds.push(Command::MetaCommand(meta_command));
} }
crate::tokens::TokenData::Int(_) => errs.push(CommandParseError::UnexpectedToken( TokenData::Semicolon => {
token, // Empty statement (just a semicolon) - ignore it
&["statement", "meta command", "eof"], }
)), TokenData::Int(_) => {
crate::tokens::TokenData::String(_) => errs.push(CommandParseError::UnexpectedToken( errs.push(CommandParseError::UnexpectedToken(
token, token,
&["statement", "meta command", "eof"], &["statement", "meta command", "eof"],
)), ));
crate::tokens::TokenData::EndOfFile => (), skip_to_next_statement(&mut tokens);
}
TokenData::String(_) => {
errs.push(CommandParseError::UnexpectedToken(
token,
&["statement", "meta command", "eof"],
));
skip_to_next_statement(&mut tokens);
}
TokenData::EndOfFile => (), // End of parsing
} }
} }
if errs.is_empty() { Ok(cmds) } else { Err(errs) } if errs.is_empty() { Ok(cmds) } else { Err(errs) }
} }
fn parse_insert_command( fn parse_insert_command(
tokens: &mut VecDeque<crate::tokens::Token>, tokens: &mut VecDeque<crate::tokens::Token>,
) -> Result<Command, CommandParseError> { ) -> Result<Command, CommandParseError> {
// According to grammar.ebnf, insert command should be: insert int string string // According to grammar.ebnf, insert command should be: insert int string string semicolon
// Parse the id (integer) // Parse the id (integer)
let id_token = tokens.pop_front().ok_or_else(|| { let id_token = tokens.pop_front().ok_or_else(|| {
@ -106,6 +189,9 @@ fn parse_insert_command(
_ => return Err(CommandParseError::UnexpectedToken(email_token, &["string"])), _ => return Err(CommandParseError::UnexpectedToken(email_token, &["string"])),
}; };
// Check for semicolon after the insert command
expect_semicolon(tokens)?;
Ok(Command::Statement(Statement::Insert { Ok(Command::Statement(Statement::Insert {
id, id,
username, username,
@ -122,8 +208,8 @@ mod tests {
fn test_parse_single_correct() { fn test_parse_single_correct() {
let file = String::from("<stdin>"); let file = String::from("<stdin>");
assert_debug_snapshot!(parse(file.clone(), String::from(".exit"))); assert_debug_snapshot!(parse(file.clone(), String::from(".exit")));
assert_debug_snapshot!(parse(file.clone(), String::from("select"))); assert_debug_snapshot!(parse(file.clone(), String::from("select;")));
assert_debug_snapshot!(parse(file.clone(), String::from("sElEcT"))); assert_debug_snapshot!(parse(file.clone(), String::from("sElEcT;")));
} }
#[test] #[test]
@ -131,13 +217,23 @@ mod tests {
let file = String::from("<stdin>"); let file = String::from("<stdin>");
assert_debug_snapshot!(parse( assert_debug_snapshot!(parse(
file.clone(), file.clone(),
String::from(r#"insert 1 "username" "email@example.com""#) String::from(r#"insert 1 "username" "email@example.com";"#)
)); ));
assert_debug_snapshot!(parse( assert_debug_snapshot!(parse(
file.clone(), file.clone(),
String::from(r#"insert "not_an_id" "username" "email@example.com""#) String::from(r#"insert "not_an_id" "username" "email@example.com";"#)
));
assert_debug_snapshot!(parse(file.clone(), String::from(r#"insert 1 "username";"#)));
}
#[test]
fn test_parse_missing_semicolon() {
let file = String::from("<stdin>");
assert_debug_snapshot!(parse(file.clone(), String::from("select")));
assert_debug_snapshot!(parse(
file.clone(),
String::from(r#"insert 1 "username" "email@example.com""#)
)); ));
assert_debug_snapshot!(parse(file.clone(), String::from(r#"insert 1 "username""#)));
} }
#[test] #[test]
@ -155,7 +251,16 @@ mod tests {
let file = String::from("<stdin>"); let file = String::from("<stdin>");
assert_debug_snapshot!(parse( assert_debug_snapshot!(parse(
file.clone(), file.clone(),
String::from(".exit select select select") String::from(".exit select; select; select;")
));
}
#[test]
fn test_parse_multiple_statements_with_insert() {
let file = String::from("<stdin>");
assert_debug_snapshot!(parse(
file.clone(),
String::from(r#"select; insert 1 "user" "email@test.com"; select;"#)
)); ));
} }

View file

@ -1,6 +1,6 @@
--- ---
source: src/parser.rs source: src/parser.rs
expression: "parse(file.clone(),\nString::from(r#\"insert \"not_an_id\" \"username\" \"email@example.com\"\"#))" expression: "parse(file.clone(),\nString::from(r#\"insert \"not_an_id\" \"username\" \"email@example.com\";\"#))"
--- ---
Err( Err(
[ [
@ -20,41 +20,5 @@ Err(
"integer", "integer",
], ],
), ),
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 19,
length: 10,
},
data: String(
"username",
),
lexeme: "\"username\"",
},
[
"statement",
"meta command",
"eof",
],
),
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 30,
length: 19,
},
data: String(
"email@example.com",
),
lexeme: "\"email@example.com\"",
},
[
"statement",
"meta command",
"eof",
],
),
], ],
) )

View file

@ -1,6 +1,6 @@
--- ---
source: src/parser.rs source: src/parser.rs
expression: "parse(file.clone(), String::from(r#\"insert 1 \"username\"\"#))" expression: "parse(file.clone(), String::from(r#\"insert 1 \"username\";\"#))"
--- ---
Err( Err(
[ [
@ -9,10 +9,10 @@ Err(
location: Location { location: Location {
file: "<stdin>", file: "<stdin>",
offset: 19, offset: 19,
length: 0, length: 1,
}, },
data: EndOfFile, data: Semicolon,
lexeme: "", lexeme: ";",
}, },
[ [
"string", "string",

View file

@ -0,0 +1,22 @@
---
source: src/parser.rs
expression: "parse(file.clone(),\nString::from(r#\"insert 1 \"username\" \"email@example.com\"\"#))"
---
Err(
[
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 39,
length: 0,
},
data: EndOfFile,
lexeme: "",
},
[
"semicolon",
],
),
],
)

View file

@ -0,0 +1,22 @@
---
source: src/parser.rs
expression: "parse(file.clone(), String::from(\"select\"))"
---
Err(
[
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 6,
length: 0,
},
data: EndOfFile,
lexeme: "",
},
[
"semicolon",
],
),
],
)

View file

@ -0,0 +1,21 @@
---
source: src/parser.rs
expression: "parse(file.clone(),\nString::from(r#\"select; insert 1 \"user\" \"email@test.com\"; select;\"#))"
---
Ok(
[
Statement(
Select,
),
Statement(
Insert {
id: 1,
username: "user",
email: "email@test.com",
},
),
Statement(
Select,
),
],
)

View file

@ -20,7 +20,7 @@ impl Statement {
username, username,
email, email,
} => StatementExecuteResult { } => StatementExecuteResult {
msg: String::from(format!("insert {id:?} {username:?} {email:?}")), msg: format!("insert {id:?} {username:?} {email:?}"),
}, },
Statement::Select => StatementExecuteResult { Statement::Select => StatementExecuteResult {
msg: String::from("select"), msg: String::from("select"),

View file

@ -1,6 +1,6 @@
use crate::meta_commands::MetaCommand; use crate::meta_commands::MetaCommand;
#[derive(Debug, Eq, PartialEq)] #[derive(Debug, Eq, PartialEq, Clone)]
pub enum TokenData { pub enum TokenData {
Insert, Insert,
Select, Select,
@ -8,6 +8,7 @@ pub enum TokenData {
EndOfFile, EndOfFile,
Int(i64), Int(i64),
String(String), String(String),
Semicolon,
} }
#[derive(Debug, Eq, PartialEq, Clone)] #[derive(Debug, Eq, PartialEq, Clone)]
@ -46,7 +47,7 @@ impl Location {
} }
} }
#[derive(Debug, Eq, PartialEq)] #[derive(Debug, Eq, PartialEq, Clone)]
pub struct Token { pub struct Token {
/// Where in the input was this token found? /// Where in the input was this token found?
pub location: Location, pub location: Location,
@ -65,6 +66,7 @@ impl std::fmt::Display for Token {
TokenData::EndOfFile => write!(f, "end of file"), TokenData::EndOfFile => write!(f, "end of file"),
TokenData::Int(x) => write!(f, "integer {x}"), TokenData::Int(x) => write!(f, "integer {x}"),
TokenData::String(x) => write!(f, "string {x:?}"), TokenData::String(x) => write!(f, "string {x:?}"),
TokenData::Semicolon => write!(f, "semicolon"),
}?; }?;
let lexeme = &self.lexeme; let lexeme = &self.lexeme;
write!(f, " {lexeme:?}") write!(f, " {lexeme:?}")
@ -307,6 +309,15 @@ impl Tokenizer {
} }
} }
/// Scans a single `;` and produces a [`TokenData::Semicolon`] token.
///
/// Used by `scan_token` when the next character is `;`. The character is
/// stepped over first, then the token's location is taken from
/// `previous_location(1)`.
// NOTE(review): presumably `previous_location(1)` yields the one-character
// span just consumed — confirm against its definition.
fn scan_semicolon(&mut self) -> Result<Token, ScanError> {
    self.advance();
    let location = self.previous_location(1);
    Ok(Token {
        location,
        data: TokenData::Semicolon,
        lexeme: String::from(";"),
    })
}
fn scan_token(&mut self) -> Result<Option<Token>, ScanError> { fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
loop { loop {
if let Some(c) = self.peek() { if let Some(c) = self.peek() {
@ -318,6 +329,8 @@ impl Tokenizer {
return self.scan_integer().map(Some); return self.scan_integer().map(Some);
} else if c == '"' { } else if c == '"' {
return self.scan_string().map(Some); return self.scan_string().map(Some);
} else if c == ';' {
return self.scan_semicolon().map(Some);
} else if c.is_whitespace() { } else if c.is_whitespace() {
self.advance(); self.advance();
} else { } else {