feat(parser): implement semicolon-separated statements

Add support for semicolon-terminated statements according to the
updated grammar. This change enables executing multiple SQL statements
in a single input by separating them with semicolons. Key improvements
include:
- Update grammar to require semicolons after statements
- Add Semicolon token to the tokenizer
- Implement error recovery by skipping to next semicolon on parse errors
- Create helper functions for checking semicolons in statement parsers
- Add tests for multiple statements and error conditions
This commit is contained in:
Khaïs COLIN 2025-06-03 17:53:22 +02:00
parent 28cb288eaf
commit e78511f692
Signed by: logistic-bot
SSH key fingerprint: SHA256:RlpiqKeXpcPFZZ4y9Ou4xi2M8OhRJovIwDlbCaMsuAo
11 changed files with 237 additions and 75 deletions

View file

@ -4,16 +4,18 @@ token ::= insert
| meta-command
| int
| string
| semicolon
| end-of-file
/* command is second stage of parsing */
command ::= cmd-insert
| cmd-select
command ::= cmd-insert semicolon
| cmd-select semicolon
cmd-insert ::= insert int string string
cmd-select ::= select
insert ::= "insert"
select ::= "select"
semicolon ::= ";"
meta-command ::= "." ( "exit"
| "about" )

View file

@ -231,14 +231,27 @@ i will use rustyline, since it seems like the most feature-complete
* TODO .license meta-command
* TODO .help meta-command
* TODO parse insert statements in the form
* DONE parse insert statements in the form
insert <id:int> <username:string> <email:string>
** TODO Row struct
** TODO parse row insert
** TODO separate statements with semicolons
** TODO in case of parse error, skip until next semicolon to better recover
** TODO serialize/deserialize row to/from raw bytes
*** TODO look for best practices for creating binary formats
** DONE parse row insert
* DONE separate statements with semicolons
* TODO this error message could be better
#+begin_example
Error: unexpected token
╭─[ <stdin>:1:24 ]
1 │ insert 0 "user" "email"
│ │
│ ╰─ found end of file ""
│ Note: expected token type to be one of ["semicolon"]
───╯
#+end_example
* TODO correct all instances of <unknown> in locations
* TODO meta-commands must be followed by end-of-file
* DONE in case of parse error, skip until next semicolon to better recover
* TODO serialize/deserialize row to/from raw bytes
** TODO look for best practices for creating binary formats
* WAIT cli tests using insta-cmd
https://insta.rs/docs/cmd/

View file

@ -1,6 +1,6 @@
use crate::branding;
#[derive(Debug, Eq, PartialEq)]
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum MetaCommand {
Exit,
About,

View file

@ -6,40 +6,123 @@ use crate::{
tokens::{Location, Token, TokenData, tokenize},
};
/// Error-recovery helper: discards the rest of a malformed statement.
///
/// Tokens are dropped from the front of `tokens` until a semicolon or an
/// end-of-file token is at the front. A semicolon is consumed as well, so the
/// caller resumes at the first token after it; an end-of-file token is left in
/// place for the caller to handle. An empty queue is left untouched.
///
/// NOTE(review): if the caller has already popped the statement's terminating
/// semicolon (for instance because it was reported as the unexpected token),
/// this will also discard the following statement — confirm against callers.
fn skip_to_next_statement(tokens: &mut VecDeque<Token>) {
    loop {
        match tokens.front().map(|tok| &tok.data) {
            // Nothing left, or the end marker: stop without consuming it.
            None | Some(TokenData::EndOfFile) => return,
            // Statement terminator: consume it and stop.
            Some(TokenData::Semicolon) => {
                tokens.pop_front();
                return;
            }
            // Anything else belongs to the broken statement: drop it.
            Some(_) => {
                tokens.pop_front();
            }
        }
    }
}
/// Consumes the semicolon that must terminate a statement.
///
/// # Errors
///
/// Returns [`CommandParseError::UnexpectedToken`] expecting `"semicolon"` when:
/// - the next token is anything other than a semicolon (the token is left in
///   the queue and a clone of it is reported), or
/// - the queue is empty, in which case a synthetic end-of-file token is
///   reported.
fn expect_semicolon(tokens: &mut VecDeque<crate::tokens::Token>) -> Result<(), CommandParseError> {
    let Some(next_token) = tokens.front() else {
        // Even at the end of input, we need a semicolon.
        // The queue is empty on this path, so there is no neighbouring token
        // to borrow a location from (`back()` is `None` whenever `front()`
        // is); the location is therefore always the `<unknown>` placeholder.
        return Err(CommandParseError::UnexpectedToken(
            Token {
                location: Location::new(String::from("<unknown>"), 0, 0),
                data: TokenData::EndOfFile,
                lexeme: String::new(),
            },
            &["semicolon"],
        ));
    };

    if matches!(next_token.data, TokenData::Semicolon) {
        tokens.pop_front(); // Consume the semicolon
        return Ok(());
    }

    Err(CommandParseError::UnexpectedToken(
        next_token.clone(),
        &["semicolon"],
    ))
}
/// Parses the remainder of a `select` statement.
///
/// The `select` keyword itself has already been consumed by the caller; the
/// statement currently takes no further arguments, so the only thing left to
/// verify is the terminating semicolon.
///
/// # Errors
///
/// Propagates the error from `expect_semicolon` when the statement is not
/// followed by a semicolon.
fn parse_select_command(
    tokens: &mut VecDeque<crate::tokens::Token>,
) -> Result<Command, CommandParseError> {
    expect_semicolon(tokens).map(|()| Command::Statement(Statement::Select))
}
pub fn parse(file: String, input: String) -> Result<Vec<Command>, Vec<CommandParseError>> {
let mut tokens: VecDeque<_> = tokenize(input, file)
.map_err(|x| x.into_iter().map(|x| x.into()).collect::<Vec<_>>())?
.into();
let mut cmds = Vec::new();
let mut errs = Vec::new();
while let Some(token) = tokens.pop_front() {
match token.data {
crate::tokens::TokenData::Insert => match parse_insert_command(&mut tokens) {
TokenData::Insert => match parse_insert_command(&mut tokens) {
Ok(cmd) => cmds.push(cmd),
Err(err) => errs.push(err),
Err(err) => {
errs.push(err);
skip_to_next_statement(&mut tokens); // Skip to next statement for error recovery
}
},
crate::tokens::TokenData::Select => cmds.push(Command::Statement(Statement::Select)),
crate::tokens::TokenData::MetaCommand(meta_command) => {
cmds.push(Command::MetaCommand(meta_command))
TokenData::Select => match parse_select_command(&mut tokens) {
Ok(cmd) => cmds.push(cmd),
Err(err) => {
errs.push(err);
skip_to_next_statement(&mut tokens); // Skip to next statement for error recovery
}
},
TokenData::MetaCommand(meta_command) => {
// Meta commands don't require semicolons per grammar
cmds.push(Command::MetaCommand(meta_command));
}
crate::tokens::TokenData::Int(_) => errs.push(CommandParseError::UnexpectedToken(
token,
&["statement", "meta command", "eof"],
)),
crate::tokens::TokenData::String(_) => errs.push(CommandParseError::UnexpectedToken(
token,
&["statement", "meta command", "eof"],
)),
crate::tokens::TokenData::EndOfFile => (),
TokenData::Semicolon => {
// Empty statement (just a semicolon) - ignore it
}
TokenData::Int(_) => {
errs.push(CommandParseError::UnexpectedToken(
token,
&["statement", "meta command", "eof"],
));
skip_to_next_statement(&mut tokens);
}
TokenData::String(_) => {
errs.push(CommandParseError::UnexpectedToken(
token,
&["statement", "meta command", "eof"],
));
skip_to_next_statement(&mut tokens);
}
TokenData::EndOfFile => (), // End of parsing
}
}
if errs.is_empty() { Ok(cmds) } else { Err(errs) }
}
fn parse_insert_command(
tokens: &mut VecDeque<crate::tokens::Token>,
) -> Result<Command, CommandParseError> {
// According to grammar.ebnf, insert command should be: insert int string string
// According to grammar.ebnf, insert command should be: insert int string string semicolon
// Parse the id (integer)
let id_token = tokens.pop_front().ok_or_else(|| {
@ -106,6 +189,9 @@ fn parse_insert_command(
_ => return Err(CommandParseError::UnexpectedToken(email_token, &["string"])),
};
// Check for semicolon after the insert command
expect_semicolon(tokens)?;
Ok(Command::Statement(Statement::Insert {
id,
username,
@ -122,8 +208,8 @@ mod tests {
fn test_parse_single_correct() {
let file = String::from("<stdin>");
assert_debug_snapshot!(parse(file.clone(), String::from(".exit")));
assert_debug_snapshot!(parse(file.clone(), String::from("select")));
assert_debug_snapshot!(parse(file.clone(), String::from("sElEcT")));
assert_debug_snapshot!(parse(file.clone(), String::from("select;")));
assert_debug_snapshot!(parse(file.clone(), String::from("sElEcT;")));
}
#[test]
@ -131,13 +217,23 @@ mod tests {
let file = String::from("<stdin>");
assert_debug_snapshot!(parse(
file.clone(),
String::from(r#"insert 1 "username" "email@example.com""#)
String::from(r#"insert 1 "username" "email@example.com";"#)
));
assert_debug_snapshot!(parse(
file.clone(),
String::from(r#"insert "not_an_id" "username" "email@example.com""#)
String::from(r#"insert "not_an_id" "username" "email@example.com";"#)
));
assert_debug_snapshot!(parse(file.clone(), String::from(r#"insert 1 "username";"#)));
}
// Snapshot test: statements that are not terminated by a semicolon.
// The calls must stay in this order, since each `assert_debug_snapshot!`
// is matched to its own stored snapshot.
#[test]
fn test_parse_missing_semicolon() {
let file = String::from("<stdin>");
// A bare `select` with no terminator.
assert_debug_snapshot!(parse(file.clone(), String::from("select")));
// A complete insert (id, username, email) with no terminator.
assert_debug_snapshot!(parse(
file.clone(),
String::from(r#"insert 1 "username" "email@example.com""#)
));
// An insert that lacks both the email argument and the terminator.
assert_debug_snapshot!(parse(file.clone(), String::from(r#"insert 1 "username""#)));
}
#[test]
@ -155,7 +251,16 @@ mod tests {
let file = String::from("<stdin>");
assert_debug_snapshot!(parse(
file.clone(),
String::from(".exit select select select")
String::from(".exit select; select; select;")
));
}
// Snapshot test: three semicolon-terminated statements (select, insert,
// select) supplied in a single input string.
#[test]
fn test_parse_multiple_statements_with_insert() {
let file = String::from("<stdin>");
assert_debug_snapshot!(parse(
file.clone(),
String::from(r#"select; insert 1 "user" "email@test.com"; select;"#)
));
}

View file

@ -1,6 +1,6 @@
---
source: src/parser.rs
expression: "parse(file.clone(),\nString::from(r#\"insert \"not_an_id\" \"username\" \"email@example.com\"\"#))"
expression: "parse(file.clone(),\nString::from(r#\"insert \"not_an_id\" \"username\" \"email@example.com\";\"#))"
---
Err(
[
@ -20,41 +20,5 @@ Err(
"integer",
],
),
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 19,
length: 10,
},
data: String(
"username",
),
lexeme: "\"username\"",
},
[
"statement",
"meta command",
"eof",
],
),
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 30,
length: 19,
},
data: String(
"email@example.com",
),
lexeme: "\"email@example.com\"",
},
[
"statement",
"meta command",
"eof",
],
),
],
)

View file

@ -1,6 +1,6 @@
---
source: src/parser.rs
expression: "parse(file.clone(), String::from(r#\"insert 1 \"username\"\"#))"
expression: "parse(file.clone(), String::from(r#\"insert 1 \"username\";\"#))"
---
Err(
[
@ -9,10 +9,10 @@ Err(
location: Location {
file: "<stdin>",
offset: 19,
length: 0,
length: 1,
},
data: EndOfFile,
lexeme: "",
data: Semicolon,
lexeme: ";",
},
[
"string",

View file

@ -0,0 +1,22 @@
---
source: src/parser.rs
expression: "parse(file.clone(),\nString::from(r#\"insert 1 \"username\" \"email@example.com\"\"#))"
---
Err(
[
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 39,
length: 0,
},
data: EndOfFile,
lexeme: "",
},
[
"semicolon",
],
),
],
)

View file

@ -0,0 +1,22 @@
---
source: src/parser.rs
expression: "parse(file.clone(), String::from(\"select\"))"
---
Err(
[
UnexpectedToken(
Token {
location: Location {
file: "<stdin>",
offset: 6,
length: 0,
},
data: EndOfFile,
lexeme: "",
},
[
"semicolon",
],
),
],
)

View file

@ -0,0 +1,21 @@
---
source: src/parser.rs
expression: "parse(file.clone(),\nString::from(r#\"select; insert 1 \"user\" \"email@test.com\"; select;\"#))"
---
Ok(
[
Statement(
Select,
),
Statement(
Insert {
id: 1,
username: "user",
email: "email@test.com",
},
),
Statement(
Select,
),
],
)

View file

@ -20,7 +20,7 @@ impl Statement {
username,
email,
} => StatementExecuteResult {
msg: String::from(format!("insert {id:?} {username:?} {email:?}")),
msg: format!("insert {id:?} {username:?} {email:?}"),
},
Statement::Select => StatementExecuteResult {
msg: String::from("select"),

View file

@ -1,6 +1,6 @@
use crate::meta_commands::MetaCommand;
#[derive(Debug, Eq, PartialEq)]
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum TokenData {
Insert,
Select,
@ -8,6 +8,7 @@ pub enum TokenData {
EndOfFile,
Int(i64),
String(String),
Semicolon,
}
#[derive(Debug, Eq, PartialEq, Clone)]
@ -46,7 +47,7 @@ impl Location {
}
}
#[derive(Debug, Eq, PartialEq)]
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Token {
/// Where in the input was this token found?
pub location: Location,
@ -65,6 +66,7 @@ impl std::fmt::Display for Token {
TokenData::EndOfFile => write!(f, "end of file"),
TokenData::Int(x) => write!(f, "integer {x}"),
TokenData::String(x) => write!(f, "string {x:?}"),
TokenData::Semicolon => write!(f, "semicolon"),
}?;
let lexeme = &self.lexeme;
write!(f, " {lexeme:?}")
@ -307,6 +309,15 @@ impl Tokenizer {
}
}
/// Scans a single `;` into a [`TokenData::Semicolon`] token.
///
/// The caller is expected to have checked that the next character is `;`;
/// this method advances past it and builds the token from the location
/// returned by `previous_location(1)`.
fn scan_semicolon(&mut self) -> Result<Token, ScanError> {
    // Step over the character first so the location lookup refers to it.
    self.advance();
    let location = self.previous_location(1);
    let semicolon = Token {
        location,
        data: TokenData::Semicolon,
        lexeme: ";".to_owned(),
    };
    Ok(semicolon)
}
fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
loop {
if let Some(c) = self.peek() {
@ -318,6 +329,8 @@ impl Tokenizer {
return self.scan_integer().map(Some);
} else if c == '"' {
return self.scan_string().map(Some);
} else if c == ';' {
return self.scan_semicolon().map(Some);
} else if c.is_whitespace() {
self.advance();
} else {