From 71a9d82d963481c040331274de8ee132d88e9e82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kha=C3=AFs=20COLIN?= Date: Sat, 10 May 2025 10:55:44 +0200 Subject: [PATCH 1/3] feat(tokenizer): add Token::Int & UnexpectedToken error type --- notes.org | 6 +++++- src/command.rs | 3 ++- src/error_display.rs | 8 ++++++-- src/parser.rs | 12 ++++++++++-- src/tokens.rs | 1 + 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/notes.org b/notes.org index 9e6c400..1d0c993 100644 --- a/notes.org +++ b/notes.org @@ -212,7 +212,11 @@ i will use rustyline, since it seems like the most feature-complete * DONE remove uneeded error variants -* TODO cli tests using insta-cmd +* STRT parse integers + +* TODO parse strings + +* WAIT cli tests using insta-cmd https://insta.rs/docs/cmd/ * WAIT autocompletion diff --git a/src/command.rs b/src/command.rs index c325db6..75c2792 100644 --- a/src/command.rs +++ b/src/command.rs @@ -1,6 +1,6 @@ use crate::meta_commands::{MetaCommand, MetaCommandExecuteResult}; use crate::statements::{Statement, StatementExecuteResult}; -use crate::tokens::ScanError; +use crate::tokens::{ScanError, Token}; #[derive(Debug)] pub enum Command { @@ -50,6 +50,7 @@ impl Command { #[derive(Debug)] pub enum CommandParseError { Scan(ScanError), + UnexpectedToken(Token, &'static [&'static str]), } impl From for Command { diff --git a/src/error_display.rs b/src/error_display.rs index fcb6468..5297abb 100644 --- a/src/error_display.rs +++ b/src/error_display.rs @@ -7,8 +7,12 @@ pub trait OSDBError { impl OSDBError for CommandParseError { fn display(&self, file: &str, input: &str) { - let CommandParseError::Scan(x) = self; - x.display(file, input); + match self { + CommandParseError::Scan(x) => { + x.display(file, input); + } + _ => todo!(), + } } } diff --git a/src/parser.rs b/src/parser.rs index 98325d8..e97d474 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -11,7 +11,7 @@ pub fn parse(file: String, input: String) -> Result, Vec>())? .into(); let mut cmds = Vec::new(); - let errs = Vec::new(); + let mut errs = Vec::new(); while let Some(token) = tokens.pop_front() { match token.data { crate::tokens::TokenData::Insert => cmds.push(Command::Statement(Statement::Insert)), @@ -19,10 +19,18 @@ pub fn parse(file: String, input: String) -> Result, Vec { cmds.push(Command::MetaCommand(meta_command)) } + crate::tokens::TokenData::Int(_) => errs.push(CommandParseError::UnexpectedToken( + token, + &["statement", "meta command", "eof"], + )), crate::tokens::TokenData::EndOfFile => (), } } - if errs.is_empty() { Ok(cmds) } else { Err(errs) } + if errs.is_empty() { + Ok(cmds) + } else { + Err(errs) + } } #[cfg(test)] diff --git a/src/tokens.rs b/src/tokens.rs index 0c5c1f7..958a254 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -6,6 +6,7 @@ pub enum TokenData { Select, MetaCommand(MetaCommand), EndOfFile, + Int(i64), } #[derive(Debug, Eq, PartialEq)] From f259b079b7655b7075c3d0d071969a76f1bd24cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kha=C3=AFs=20COLIN?= Date: Sat, 10 May 2025 10:55:44 +0200 Subject: [PATCH 2/3] feat(tokenizer): parse integers --- notes.org | 4 ++ src/error_display.rs | 16 ++++- src/meta_commands.rs | 8 +++ ...osdb__tokens__tests__tokenizer_errors.snap | 46 ++++++------ ...__tokens__tests__tokenizer_integers-2.snap | 28 ++++++++ ...__tokens__tests__tokenizer_integers-3.snap | 28 ++++++++ ...__tokens__tests__tokenizer_integers-4.snap | 28 ++++++++ ...__tokens__tests__tokenizer_integers-5.snap | 20 ++++++ ...__tokens__tests__tokenizer_integers-6.snap | 20 ++++++ ...__tokens__tests__tokenizer_integers-7.snap | 20 ++++++ ...__tokens__tests__tokenizer_integers-8.snap | 20 ++++++ ...db__tokens__tests__tokenizer_integers.snap | 28 ++++++++ src/tokens.rs | 72 +++++++++++++++++-- 13 files changed, 311 insertions(+), 27 deletions(-) create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-2.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-3.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-4.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-5.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-6.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-7.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers-8.snap create mode 100644 src/snapshots/osdb__tokens__tests__tokenizer_integers.snap diff --git a/notes.org b/notes.org index 1d0c993..8813f59 100644 --- a/notes.org +++ b/notes.org @@ -214,6 +214,10 @@ i will use rustyline, since it seems like the most feature-complete * STRT parse integers +** TODO Function to get a token until condition is false + +** TODO Parse the integer + * TODO parse strings * WAIT cli tests using insta-cmd diff --git a/src/error_display.rs b/src/error_display.rs index 5297abb..cd303b6 100644 --- a/src/error_display.rs +++ b/src/error_display.rs @@ -11,7 +11,20 @@ impl OSDBError for CommandParseError { CommandParseError::Scan(x) => { x.display(file, input); } - _ => todo!(), + CommandParseError::UnexpectedToken(token, items) => { + let location = (file, Into::>::into(&token.location)); + Report::build(ReportKind::Error, location.clone()) + .with_message("unexpected token") + .with_label( + Label::new(location.clone()) + .with_color(Color::Red) + .with_message(format!("found {token}")), + ) + .with_note(format!("expected token type to be one of {items:?}")) + .finish() + .print((file, Source::from(input))) + .unwrap() + } } } } @@ -26,7 +39,6 @@ impl OSDBError for ScanError { .with_color(Color::Red) .with_message(format!("{self}")), ) - .with_help("Make sure you don't have any typos or unexpected characters.") .finish() .print((file, Source::from(input))) .unwrap(); diff --git a/src/meta_commands.rs b/src/meta_commands.rs index 898776c..59ff37f 100644 --- a/src/meta_commands.rs +++ b/src/meta_commands.rs @@ -3,6 +3,14 @@ pub enum MetaCommand { Exit, } +impl std::fmt::Display for MetaCommand { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + MetaCommand::Exit => write!(f, "exit"), + } + } +} + pub struct MetaCommandExecuteResult { pub should_exit: bool, } diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_errors.snap b/src/snapshots/osdb__tokens__tests__tokenizer_errors.snap index b5886f1..347e0ca 100644 --- a/src/snapshots/osdb__tokens__tests__tokenizer_errors.snap +++ b/src/snapshots/osdb__tokens__tests__tokenizer_errors.snap @@ -1,26 +1,30 @@ --- source: src/tokens.rs -expression: scanerrors +expression: "tokenize(\"salact +\".to_string(), \"src/statement.sql\".to_string())" --- -[ - ScanError { - location: Location { - file: "src/statement.sql", - offset: 0, - length: 6, +Err( + [ + ScanError { + location: Location { + file: "src/statement.sql", + offset: 0, + length: 6, + }, + kind: UnknownKeyword( + "salact", + ), }, - kind: UnknownKeyword( - "salact", - ), - }, - ScanError { - location: Location { - file: "src/statement.sql", - offset: 7, - length: 1, + ScanError { + location: Location { + file: "src/statement.sql", + offset: 7, + length: 1, + }, + kind: ParseIntError( + ParseIntError { + kind: InvalidDigit, + }, + ), }, - kind: UnexpectedChar( - '+', - ), - }, -] + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-2.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-2.snap new file mode 100644 index 0000000..9b27ecf --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-2.snap @@ -0,0 +1,28 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"-10\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Ok( + [ + Token { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 3, + }, + data: Int( + -10, + ), + lexeme: "-10", + }, + Token { + location: Location { + file: "src/ints.sql", + offset: 3, + length: 0, + }, + data: EndOfFile, + lexeme: "", + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-3.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-3.snap new file mode 100644 index 0000000..d125fbb --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-3.snap @@ -0,0 +1,28 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"0\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Ok( + [ + Token { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 1, + }, + data: Int( + 0, + ), + lexeme: "0", + }, + Token { + location: Location { + file: "src/ints.sql", + offset: 1, + length: 0, + }, + data: EndOfFile, + lexeme: "", + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-4.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-4.snap new file mode 100644 index 0000000..ea93fa4 --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-4.snap @@ -0,0 +1,28 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"-0\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Ok( + [ + Token { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 2, + }, + data: Int( + 0, + ), + lexeme: "-0", + }, + Token { + location: Location { + file: "src/ints.sql", + offset: 2, + length: 0, + }, + data: EndOfFile, + lexeme: "", + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-5.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-5.snap new file mode 100644 index 0000000..31bb3b8 --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-5.snap @@ -0,0 +1,20 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"--0\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Err( + [ + ScanError { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 3, + }, + kind: ParseIntError( + ParseIntError { + kind: InvalidDigit, + }, + ), + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-6.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-6.snap new file mode 100644 index 0000000..75c6381 --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-6.snap @@ -0,0 +1,20 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"++0\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Err( + [ + ScanError { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 3, + }, + kind: ParseIntError( + ParseIntError { + kind: InvalidDigit, + }, + ), + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-7.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-7.snap new file mode 100644 index 0000000..daf026f --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-7.snap @@ -0,0 +1,20 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"-\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Err( + [ + ScanError { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 1, + }, + kind: ParseIntError( + ParseIntError { + kind: InvalidDigit, + }, + ), + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers-8.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers-8.snap new file mode 100644 index 0000000..43bb23c --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers-8.snap @@ -0,0 +1,20 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"+\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Err( + [ + ScanError { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 1, + }, + kind: ParseIntError( + ParseIntError { + kind: InvalidDigit, + }, + ), + }, + ], +) diff --git a/src/snapshots/osdb__tokens__tests__tokenizer_integers.snap b/src/snapshots/osdb__tokens__tests__tokenizer_integers.snap new file mode 100644 index 0000000..96d83a3 --- /dev/null +++ b/src/snapshots/osdb__tokens__tests__tokenizer_integers.snap @@ -0,0 +1,28 @@ +--- +source: src/tokens.rs +expression: "tokenize(\"10\".to_string(), \"src/ints.sql\".to_string(),)" +--- +Ok( + [ + Token { + location: Location { + file: "src/ints.sql", + offset: 0, + length: 2, + }, + data: Int( + 10, + ), + lexeme: "10", + }, + Token { + location: Location { + file: "src/ints.sql", + offset: 2, + length: 0, + }, + data: EndOfFile, + lexeme: "", + }, + ], +) diff --git a/src/tokens.rs b/src/tokens.rs index 958a254..7c44906 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -55,6 +55,20 @@ pub struct Token { pub lexeme: String, } +impl std::fmt::Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match &self.data { + TokenData::Insert => write!(f, "insert statement"), + TokenData::Select => write!(f, "select statement"), + TokenData::MetaCommand(x) => write!(f, "meta-command {x}"), + TokenData::EndOfFile => write!(f, "end of file"), + TokenData::Int(x) => write!(f, "integer {x}"), + }?; + let lexeme = &self.lexeme; + write!(f, " {lexeme:?}") + } +} + struct Tokenizer { input: String, file: String, @@ -68,6 +82,7 @@ pub enum ScanErrorKind { UnexpectedEndOfInput, UnknownKeyword(String), UnknownMetaCommand(String), + ParseIntError(std::num::ParseIntError), } impl std::fmt::Display for ScanErrorKind { @@ -77,6 +92,7 @@ impl std::fmt::Display for ScanErrorKind { ScanErrorKind::UnexpectedEndOfInput => write!(f, "unexpected end of input"), ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"), ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"), + ScanErrorKind::ParseIntError(x) => write!(f, "failed to parse integer: {x}"), } } } @@ -207,6 +223,40 @@ impl Tokenizer { c.is_alphanumeric() || c == '_' } + fn digit(c: char) -> bool { + c.is_ascii_digit() || c == '-' || c == '+' + } + + fn scan_integer(&mut self) -> Result { + let start_offset = self.offset; + let mut word = String::new(); + let mut length = 0; + if let Some(c) = self.advance() { + word.push(c); + length += 1; + } + while let Some(c) = self.peek() { + if Self::digit(c) { + word.push(c); + self.advance(); + } else { + break; + } + length += 1; + } + match word.parse::() { + Ok(int) => Ok(Token { + location: Location::new(self.file.clone(), start_offset, length), + data: TokenData::Int(int), + lexeme: word, + }), + Err(e) => Err(ScanError { + location: Location::new(self.file.clone(), start_offset, length), + kind: ScanErrorKind::ParseIntError(e), + }), + } + } + fn scan_token(&mut self) -> Result, ScanError> { loop { if let Some(c) = self.peek() { @@ -214,6 +264,8 @@ impl Tokenizer { return self.scan_identifier_or_keyword().map(Some); } else if c == '.' { return self.scan_meta_command().map(Some); + } else if Self::digit(c) { + return self.scan_integer().map(Some); } else if c.is_whitespace() { self.advance(); } else { @@ -308,9 +360,21 @@ mod tests { #[test] fn test_tokenizer_errors() { - let scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string()) - .err() - .unwrap(); - assert_debug_snapshot!(scanerrors); + assert_debug_snapshot!(tokenize( + "salact +".to_string(), + "src/statement.sql".to_string() + )); + } + + #[test] + fn test_tokenizer_integers() { + assert_debug_snapshot!(tokenize("10".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("-10".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("0".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("-0".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("--0".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("++0".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("-".to_string(), "src/ints.sql".to_string(),)); + assert_debug_snapshot!(tokenize("+".to_string(), "src/ints.sql".to_string(),)); } } From 00e9bc3b6005fbcc27cbe466e1c68912e09f3872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kha=C3=AFs=20COLIN?= Date: Sat, 24 May 2025 13:28:56 +0200 Subject: [PATCH 3/3] feat(tokenizer): string tokenizing --- notes.org | 17 +++++++++++++---- src/cli.rs | 6 ++++-- src/parser.rs | 10 +++++----- src/tokens.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 11 deletions(-) diff --git a/notes.org b/notes.org index 8813f59..a00857a 100644 --- a/notes.org +++ b/notes.org @@ -212,13 +212,22 @@ i will use rustyline, since it seems like the most feature-complete * DONE remove uneeded error variants -* STRT parse integers +* DONE parse integers -** TODO Function to get a token until condition is false +** DONE Function to get a token until condition is false -** TODO Parse the integer +** DONE Parse the integer -* TODO parse strings +* DONE parse strings + +* TODO better error message display for unclosed " in string + +* TODO parse insert statements in the form +insert +** TODO Row struct +** TODO parse row insert +** TODO serialize/deserialize row to/from raw bytes +*** TODO look for best practices for creating binary formats * WAIT cli tests using insta-cmd https://insta.rs/docs/cmd/ diff --git a/src/cli.rs b/src/cli.rs index 4aa33d0..b2fc3bc 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,6 @@ use std::path::PathBuf; -use rustyline::{history::FileHistory, Editor}; +use rustyline::{Editor, history::FileHistory}; fn xdg_state_dir() -> Option { if let Ok(dir) = std::env::var("XDG_STATE_DIR") { @@ -31,7 +31,9 @@ pub fn history_file() -> Option { Some(state.join("cli_history")) } else { eprintln!("Warning: failed to find or create XDG_STATE_DIR for osdb."); - eprintln!("Warning: either set XDG_STATE_DIR or HOME, and ensure osdb has write permissions to that directory."); + eprintln!( + "Warning: either set XDG_STATE_DIR or HOME, and ensure osdb has write permissions to that directory." + ); None } } diff --git a/src/parser.rs b/src/parser.rs index e97d474..7cff3b0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -23,14 +23,14 @@ pub fn parse(file: String, input: String) -> Result, Vec errs.push(CommandParseError::UnexpectedToken( + token, + &["statement", "meta command", "eof"], + )), crate::tokens::TokenData::EndOfFile => (), } } - if errs.is_empty() { - Ok(cmds) - } else { - Err(errs) - } + if errs.is_empty() { Ok(cmds) } else { Err(errs) } } #[cfg(test)] diff --git a/src/tokens.rs b/src/tokens.rs index 7c44906..b810d17 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -7,6 +7,7 @@ pub enum TokenData { MetaCommand(MetaCommand), EndOfFile, Int(i64), + String(String), } #[derive(Debug, Eq, PartialEq)] @@ -63,6 +64,7 @@ impl std::fmt::Display for Token { TokenData::MetaCommand(x) => write!(f, "meta-command {x}"), TokenData::EndOfFile => write!(f, "end of file"), TokenData::Int(x) => write!(f, "integer {x}"), + TokenData::String(x) => write!(f, "string {x:?}"), }?; let lexeme = &self.lexeme; write!(f, " {lexeme:?}") @@ -83,6 +85,7 @@ pub enum ScanErrorKind { UnknownKeyword(String), UnknownMetaCommand(String), ParseIntError(std::num::ParseIntError), + UnexpectedEndOfInputWhileLookingForMatching(char, Location), } impl std::fmt::Display for ScanErrorKind { @@ -93,6 +96,10 @@ impl std::fmt::Display for ScanErrorKind { ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"), ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"), ScanErrorKind::ParseIntError(x) => write!(f, "failed to parse integer: {x}"), + ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(c, _) => write!( + f, + "unexpected end of input while looking for matching {c:?}" + ), } } } @@ -124,6 +131,10 @@ impl Tokenizer { Location::new(self.file.clone(), self.offset, length) } + fn previous_location(&self, length: usize) -> Location { + Location::new(self.file.clone(), self.offset - 1, length) + } + fn is_at_end(&self) -> bool { self.offset >= self.input.len() } @@ -257,6 +268,43 @@ impl Tokenizer { } } + fn scan_string(&mut self) -> Result { + let start_offset = self.offset; + let mut word = String::new(); + let mut lexeme = String::new(); + let mut length = 0; + let mut valid = false; + if let Some(c) = self.advance() { + lexeme.push(c); + length += 1; + } + while let Some(c) = self.advance() { + lexeme.push(c); + length += 1; + if c == '"' { + valid = true; + break; + } else { + word.push(c); + } + } + if valid { + Ok(Token { + location: Location::new(self.file.clone(), start_offset, length), + data: TokenData::String(word), + lexeme, + }) + } else { + Err(ScanError { + location: self.previous_location(0), + kind: ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching( + '"', + Location::new(self.file.clone(), start_offset, 1), + ), + }) + } + } + fn scan_token(&mut self) -> Result, ScanError> { loop { if let Some(c) = self.peek() { @@ -266,6 +314,8 @@ impl Tokenizer { return self.scan_meta_command().map(Some); } else if Self::digit(c) { return self.scan_integer().map(Some); + } else if c == '"' { + return self.scan_string().map(Some); } else if c.is_whitespace() { self.advance(); } else {