From 47a68e5069c927ecaba6680c43423950fb639c85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kha=C3=AFs=20COLIN?=
Date: Wed, 4 Jun 2025 20:44:12 +0200
Subject: [PATCH] docs: add code comments and test cases for tokenization

---
 .gitignore                                    |   1 +
 .../osdb__tokens__tests__mixed_input.snap     |  88 +++++++++
 .../osdb__tokens__tests__string_errors-2.snap |  23 +++
 .../osdb__tokens__tests__string_errors.snap   |  23 +++
 ...b__tokens__tests__whitespace_handling.snap |  46 +++++
 src/tokens.rs                                 | 176 +++++++++++++++++-
 6 files changed, 347 insertions(+), 10 deletions(-)
 create mode 100644 src/snapshots/osdb__tokens__tests__mixed_input.snap
 create mode 100644 src/snapshots/osdb__tokens__tests__string_errors-2.snap
 create mode 100644 src/snapshots/osdb__tokens__tests__string_errors.snap
 create mode 100644 src/snapshots/osdb__tokens__tests__whitespace_handling.snap

diff --git a/.gitignore b/.gitignore
index 1afea7e..34c8f75 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /.direnv
 /target
 /result
+.aider*
diff --git a/src/snapshots/osdb__tokens__tests__mixed_input.snap b/src/snapshots/osdb__tokens__tests__mixed_input.snap
new file mode 100644
index 0000000..534151b
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__mixed_input.snap
@@ -0,0 +1,88 @@
+---
+source: src/tokens.rs
+expression: "tokenize(r#\".exit INSERT 42 \"user\" \"email\"; SELECT\"#.to_string(),\n\"\".to_string())"
+---
+Ok(
+    [
+        Token {
+            location: Location {
+                file: "",
+                offset: 0,
+                length: 5,
+            },
+            data: MetaCommand(
+                Exit,
+            ),
+            lexeme: ".exit",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 6,
+                length: 6,
+            },
+            data: Insert,
+            lexeme: "INSERT",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 13,
+                length: 2,
+            },
+            data: Int(
+                42,
+            ),
+            lexeme: "42",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 16,
+                length: 6,
+            },
+            data: String(
+                "user",
+            ),
+            lexeme: "\"user\"",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 23,
+                length: 7,
+            },
+            data: String(
+                "email",
+            ),
+            lexeme: "\"email\"",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 30,
+                length: 1,
+            },
+            data: Semicolon,
+            lexeme: ";",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 32,
+                length: 6,
+            },
+            data: Select,
+            lexeme: "SELECT",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 38,
+                length: 0,
+            },
+            data: EndOfFile,
+            lexeme: "",
+        },
+    ],
+)
diff --git a/src/snapshots/osdb__tokens__tests__string_errors-2.snap b/src/snapshots/osdb__tokens__tests__string_errors-2.snap
new file mode 100644
index 0000000..8f29a5e
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__string_errors-2.snap
@@ -0,0 +1,23 @@
+---
+source: src/tokens.rs
+expression: "tokenize(r#\"SELECT \"valid\"; \"invalid\"#.to_string(), \"\".to_string())"
+---
+Err(
+    [
+        ScanError {
+            location: Location {
+                file: "",
+                offset: 24,
+                length: 0,
+            },
+            kind: UnexpectedEndOfInputWhileLookingForMatching(
+                '"',
+                Location {
+                    file: "",
+                    offset: 16,
+                    length: 1,
+                },
+            ),
+        },
+    ],
+)
diff --git a/src/snapshots/osdb__tokens__tests__string_errors.snap b/src/snapshots/osdb__tokens__tests__string_errors.snap
new file mode 100644
index 0000000..83f4e5b
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__string_errors.snap
@@ -0,0 +1,23 @@
+---
+source: src/tokens.rs
+expression: "tokenize(r#\"INSERT \"unclosed string\"#.to_string(), \"\".to_string())"
+---
+Err(
+    [
+        ScanError {
+            location: Location {
+                file: "",
+                offset: 23,
+                length: 0,
+            },
+            kind: UnexpectedEndOfInputWhileLookingForMatching(
+                '"',
+                Location {
+                    file: "",
+                    offset: 7,
+                    length: 1,
+                },
+            ),
+        },
+    ],
+)
diff --git a/src/snapshots/osdb__tokens__tests__whitespace_handling.snap b/src/snapshots/osdb__tokens__tests__whitespace_handling.snap
new file mode 100644
index 0000000..5fa5e88
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__whitespace_handling.snap
@@ -0,0 +1,46 @@
+---
+source: src/tokens.rs
+expression: "tokenize(\"  INSERT   \\t\\n42\\n\\t;\".to_string(), \"\".to_string())"
+---
+Ok(
+    [
+        Token {
+            location: Location {
+                file: "",
+                offset: 2,
+                length: 6,
+            },
+            data: Insert,
+            lexeme: "INSERT",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 13,
+                length: 2,
+            },
+            data: Int(
+                42,
+            ),
+            lexeme: "42",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 17,
+                length: 1,
+            },
+            data: Semicolon,
+            lexeme: ";",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 18,
+                length: 0,
+            },
+            data: EndOfFile,
+            lexeme: "",
+        },
+    ],
+)
diff --git a/src/tokens.rs b/src/tokens.rs
index 88610d2..e872030 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -1,23 +1,54 @@
 use crate::meta_commands::MetaCommand;
 
+/// Represents the core lexical elements of the SQL-like language.
+///
+/// Tokens are produced by the tokenizer and consumed by the parser to build
+/// abstract syntax trees. Each variant represents a distinct syntactic element
+/// with associated data when applicable.
+///
+/// # Examples
+///
+/// ```
+/// use osdb::tokens::TokenData;
+/// use osdb::meta_commands::MetaCommand;
+///
+/// // Keyword tokens
+/// let insert = TokenData::Insert;
+/// let select = TokenData::Select;
+///
+/// // Meta command with parameter
+/// let exit_cmd = TokenData::MetaCommand(MetaCommand::Exit);
+///
+/// // Literal values
+/// let number = TokenData::Int(42);
+/// let text = TokenData::String("hello".to_string());
+/// ```
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub enum TokenData {
+    /// INSERT statement keyword
     Insert,
+    /// SELECT statement keyword
     Select,
+    /// Meta command (commands starting with '.')
     MetaCommand(MetaCommand),
+    /// End of file marker
     EndOfFile,
+    /// Integer literal value
     Int(i64),
+    /// String literal value
     String(String),
+    /// Semicolon statement terminator
     Semicolon,
 }
 
+/// Represents a location in the source input
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub struct Location {
-    /// file name
+    /// Source file name
     pub file: String,
-    /// Since start of file
+    /// Offset from the start of the file in characters
     pub offset: usize,
-    /// Length of the litteral
+    /// Length of the token in characters
     pub length: usize,
 }

@@ -37,12 +68,21 @@ impl Default for Location {
 }
 
 impl Location {
+    /// Creates a new Location with the given file, offset, and length.
+    ///
+    /// # Examples
+    ///
     /// ```
     /// use osdb::tokens::Location;
-    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
-    /// assert_eq!(location.file, "src/statement.sql");
-    /// assert_eq!(location.offset, 0);
-    /// assert_eq!(location.length, 10);
+    ///
+    /// // Create a location for a token spanning characters 5-15 in a file
+    /// let loc = Location::new("query.sql".into(), 5, 10);
+    /// assert_eq!(loc.offset, 5);
+    /// assert_eq!(loc.length, 10);
+    ///
+    /// // Zero-length location for EOF marker
+    /// let eof = Location::new("".into(), 20, 0);
+    /// assert_eq!(eof.length, 0);
     /// ```
     pub fn new(file: String, offset: usize, length: usize) -> Self {
         Self {
@@ -53,17 +93,56 @@ impl Location {
     }
 }
 
+/// Represents a token in the input source
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub struct Token {
-    /// Where in the input was this token found?
+    /// Location of the token in the source input
     pub location: Location,
-    /// What is in it?
+    /// The parsed token data
    pub data: TokenData,
-    /// What did it look like while being parsed?
+    /// Original text representation in the source
     pub lexeme: String,
 }
 
+/// Displaying a `Token` shows its semantic meaning followed by the original lexeme.
+///
+/// # Examples
+///
+/// ```
+/// use osdb::tokens::{Token, TokenData, Location};
+/// use osdb::meta_commands::MetaCommand;
+///
+/// let token = Token {
+///     location: Location::new("input.sql".into(), 0, 6),
+///     data: TokenData::Select,
+///     lexeme: "SELECT".to_string(),
+/// };
+///
+/// assert_eq!(format!("{token}"), "select statement \"SELECT\"");
+/// ```
 impl std::fmt::Display for Token {
+    /// Formats the token for display, showing both the semantic meaning
+    /// and original lexeme.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use osdb::tokens::{Token, TokenData, Location};
+    /// # let location = Location::default();
+    /// let int_token = Token {
+    ///     location: location.clone(),
+    ///     data: TokenData::Int(42),
+    ///     lexeme: "42".to_string(),
+    /// };
+    /// assert_eq!(format!("{int_token}"), "integer 42 \"42\"");
+    ///
+    /// let string_token = Token {
+    ///     location,
+    ///     data: TokenData::String("hello".into()),
+    ///     lexeme: "\"hello\"".to_string(),
+    /// };
+    /// assert_eq!(format!("{string_token}"), "string \"hello\" \"\\\"hello\\\"\"");
+    /// ```
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         match &self.data {
             TokenData::Insert => write!(f, "insert statement"),
@@ -86,13 +165,20 @@ struct Tokenizer {
     offset: usize,
 }
 
+/// Represents different kinds of errors that can occur during tokenization
 #[derive(Debug, Eq, PartialEq)]
 pub enum ScanErrorKind {
+    /// Encountered an unexpected character
     UnexpectedChar(char),
+    /// Reached the end of input unexpectedly
     UnexpectedEndOfInput,
+    /// Encountered an unknown keyword
     UnknownKeyword(String),
+    /// Encountered an unknown meta command
     UnknownMetaCommand(String),
+    /// Failed to parse an integer value
     ParseIntError(std::num::ParseIntError),
+    /// Reached the end of input while looking for a matching character
     UnexpectedEndOfInputWhileLookingForMatching(char, Location),
 }
 
@@ -112,9 +198,12 @@ impl std::fmt::Display for ScanErrorKind {
     }
 }
 
+/// Error that occurred during tokenization, with location information
 #[derive(Debug, Eq, PartialEq)]
 pub struct ScanError {
+    /// Location where the error occurred
     pub location: Location,
+    /// Type of scanning error
     pub kind: ScanErrorKind,
 }
 
@@ -362,6 +451,45 @@ impl Tokenizer {
     }
 }
 
+/// Converts a string input into a sequence of tokens
+///
+/// Takes the input to tokenize and a filename for error reporting.
+/// Returns either a vector of tokens or a vector of scanning errors.
+///
+/// # Examples
+///
+/// ## Valid inputs
+/// ```
+/// # use osdb::tokens::tokenize;
+/// // Basic SELECT statement
+/// let tokens = tokenize("SELECT;".into(), "".into()).unwrap();
+/// assert_eq!(tokens.len(), 3); // SELECT, semicolon, EOF
+///
+/// // INSERT with values
+/// let insert = tokenize(r#"INSERT 42 "user" "email""#.into(), "".into()).unwrap();
+/// assert_eq!(insert.len(), 5); // INSERT, int, string, string, EOF
+///
+/// // Meta-commands
+/// let meta = tokenize(".exit .about".into(), "".into()).unwrap();
+/// assert_eq!(meta.len(), 3); // Two meta commands and EOF
+/// ```
+///
+/// ## Invalid inputs
+/// ```
+/// # use osdb::tokens::tokenize;
+/// // Each of these returns Err; the exact errors are pinned by snapshot tests.
+/// // Unclosed string
+/// let err = tokenize(r#"INSERT 1 "unclosed"#.into(), "".into());
+/// assert!(err.is_err());
+///
+/// // Integer too large for i64
+/// let err = tokenize("INSERT 9223372036854775808".into(), "".into());
+/// assert!(err.is_err());
+///
+/// // Unknown meta-command
+/// let err = tokenize(".invalid".into(), "".into());
+/// assert!(err.is_err());
+/// ```
 pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
     let mut tokenizer = Tokenizer::new(input, file);
     let mut errors = Vec::new();
@@ -448,4 +576,32 @@ mod tests {
         assert_debug_snapshot!(tokenize("-".to_string(), "src/ints.sql".to_string(),));
         assert_debug_snapshot!(tokenize("+".to_string(), "src/ints.sql".to_string(),));
     }
+
+    #[test]
+    fn test_string_errors() {
+        assert_debug_snapshot!(tokenize(
+            r#"INSERT "unclosed string"#.to_string(),
+            "".to_string()
+        ));
+        assert_debug_snapshot!(tokenize(
+            r#"SELECT "valid"; "invalid"#.to_string(),
+            "".to_string()
+        ));
+    }
+
+    #[test]
+    fn test_mixed_input() {
+        assert_debug_snapshot!(tokenize(
+            r#".exit INSERT 42 "user" "email"; SELECT"#.to_string(),
+            "".to_string()
+        ));
+    }
+
+    #[test]
+    fn test_whitespace_handling() {
+        assert_debug_snapshot!(tokenize(
+            "  INSERT   \t\n42\n\t;".to_string(),
+            "".to_string()
+        ));
+    }
 }
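-- 

A usage sketch for reviewers, not part of the patch: one way a caller might
drive the `tokenize` API documented above. This assumes the `osdb` crate
paths used in the doc examples; the `report` helper itself is hypothetical.

```
use osdb::tokens::{tokenize, TokenData};

// Hypothetical helper: scan one line of input and print tokens or scan errors.
fn report(input: &str) {
    match tokenize(input.to_string(), "<repl>".to_string()) {
        Ok(tokens) => {
            for token in &tokens {
                // Skip the zero-length EndOfFile marker.
                if token.data != TokenData::EndOfFile {
                    // Token implements Display: semantic meaning plus lexeme.
                    println!("{token} at offset {}", token.location.offset);
                }
            }
        }
        Err(errors) => {
            // Each ScanError carries a Location and a ScanErrorKind,
            // and ScanErrorKind implements Display.
            for e in &errors {
                eprintln!("error at offset {}: {}", e.location.offset, e.kind);
            }
        }
    }
}

fn main() {
    report(r#"INSERT 42 "user" "email";"#);
    report(r#"SELECT "unclosed"#); // exercises the error path
}
```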