From 47a68e5069c927ecaba6680c43423950fb639c85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kha=C3=AFs=20COLIN?=
Date: Wed, 4 Jun 2025 20:44:12 +0200
Subject: [PATCH] docs: add code comments and test cases for tokenization

---
 .gitignore                                    |   1 +
 .../osdb__tokens__tests__mixed_input.snap     |  88 +++++++++
 .../osdb__tokens__tests__string_errors-2.snap |  23 +++
 .../osdb__tokens__tests__string_errors.snap   |  23 +++
 ...b__tokens__tests__whitespace_handling.snap |  46 +++++
 src/tokens.rs                                 | 176 +++++++++++++++++-
 6 files changed, 347 insertions(+), 10 deletions(-)
 create mode 100644 src/snapshots/osdb__tokens__tests__mixed_input.snap
 create mode 100644 src/snapshots/osdb__tokens__tests__string_errors-2.snap
 create mode 100644 src/snapshots/osdb__tokens__tests__string_errors.snap
 create mode 100644 src/snapshots/osdb__tokens__tests__whitespace_handling.snap

diff --git a/.gitignore b/.gitignore
index 1afea7e..34c8f75 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /.direnv
 /target
 /result
+.aider*
diff --git a/src/snapshots/osdb__tokens__tests__mixed_input.snap b/src/snapshots/osdb__tokens__tests__mixed_input.snap
new file mode 100644
index 0000000..534151b
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__mixed_input.snap
@@ -0,0 +1,88 @@
+---
+source: src/tokens.rs
+expression: "tokenize(r#\".exit INSERT 42 \"user\" \"email\"; SELECT\"#.to_string(),\n\"\".to_string())"
+---
+Ok(
+    [
+        Token {
+            location: Location {
+                file: "",
+                offset: 0,
+                length: 5,
+            },
+            data: MetaCommand(
+                Exit,
+            ),
+            lexeme: ".exit",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 6,
+                length: 6,
+            },
+            data: Insert,
+            lexeme: "INSERT",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 13,
+                length: 2,
+            },
+            data: Int(
+                42,
+            ),
+            lexeme: "42",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 16,
+                length: 6,
+            },
+            data: String(
+                "user",
+            ),
+            lexeme: "\"user\"",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 23,
+                length: 7,
+            },
+            data: String(
+                "email",
+            ),
+            lexeme: "\"email\"",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 30,
+                length: 1,
+            },
+            data: Semicolon,
+            lexeme: ";",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 32,
+                length: 6,
+            },
+            data: Select,
+            lexeme: "SELECT",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 38,
+                length: 0,
+            },
+            data: EndOfFile,
+            lexeme: "",
+        },
+    ],
+)
diff --git a/src/snapshots/osdb__tokens__tests__string_errors-2.snap b/src/snapshots/osdb__tokens__tests__string_errors-2.snap
new file mode 100644
index 0000000..8f29a5e
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__string_errors-2.snap
@@ -0,0 +1,23 @@
+---
+source: src/tokens.rs
+expression: "tokenize(r#\"SELECT \"valid\"; \"invalid\"#.to_string(), \"\".to_string())"
+---
+Err(
+    [
+        ScanError {
+            location: Location {
+                file: "",
+                offset: 24,
+                length: 0,
+            },
+            kind: UnexpectedEndOfInputWhileLookingForMatching(
+                '"',
+                Location {
+                    file: "",
+                    offset: 16,
+                    length: 1,
+                },
+            ),
+        },
+    ],
+)
diff --git a/src/snapshots/osdb__tokens__tests__string_errors.snap b/src/snapshots/osdb__tokens__tests__string_errors.snap
new file mode 100644
index 0000000..83f4e5b
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__string_errors.snap
@@ -0,0 +1,23 @@
+---
+source: src/tokens.rs
+expression: "tokenize(r#\"INSERT \"unclosed string\"#.to_string(), \"\".to_string())"
+---
+Err(
+    [
+        ScanError {
+            location: Location {
+                file: "",
+                offset: 23,
+                length: 0,
+            },
+            kind: UnexpectedEndOfInputWhileLookingForMatching(
+                '"',
+                Location {
+                    file: "",
+                    offset: 7,
+                    length: 1,
+                },
+            ),
+        },
+    ],
+)
diff --git a/src/snapshots/osdb__tokens__tests__whitespace_handling.snap b/src/snapshots/osdb__tokens__tests__whitespace_handling.snap
new file mode 100644
index 0000000..5fa5e88
--- /dev/null
+++ b/src/snapshots/osdb__tokens__tests__whitespace_handling.snap
@@ -0,0 +1,46 @@
+---
+source: src/tokens.rs
+expression: "tokenize(\"  INSERT   \\t\\n42\\n\\t;\".to_string(), \"\".to_string())"
+---
+Ok(
+    [
+        Token {
+            location: Location {
+                file: "",
+                offset: 2,
+                length: 6,
+            },
+            data: Insert,
+            lexeme: "INSERT",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 13,
+                length: 2,
+            },
+            data: Int(
+                42,
+            ),
+            lexeme: "42",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 17,
+                length: 1,
+            },
+            data: Semicolon,
+            lexeme: ";",
+        },
+        Token {
+            location: Location {
+                file: "",
+                offset: 18,
+                length: 0,
+            },
+            data: EndOfFile,
+            lexeme: "",
+        },
+    ],
+)
diff --git a/src/tokens.rs b/src/tokens.rs
index 88610d2..e872030 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -1,23 +1,54 @@
 use crate::meta_commands::MetaCommand;
 
+/// Represents the core lexical elements of the SQL-like language.
+///
+/// Tokens are produced by the tokenizer and consumed by the parser to build
+/// abstract syntax trees. Each variant represents a distinct syntactic element
+/// with associated data when applicable.
+///
+/// # Examples
+///
+/// ```
+/// use osdb::tokens::TokenData;
+/// use osdb::meta_commands::MetaCommand;
+///
+/// // Keyword tokens
+/// let insert = TokenData::Insert;
+/// let select = TokenData::Select;
+///
+/// // Meta command with parameter
+/// let exit_cmd = TokenData::MetaCommand(MetaCommand::Exit);
+///
+/// // Literal values
+/// let number = TokenData::Int(42);
+/// let text = TokenData::String("hello".to_string());
+/// ```
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub enum TokenData {
+    /// INSERT statement keyword
     Insert,
+    /// SELECT statement keyword
     Select,
+    /// Meta command (commands starting with '.')
     MetaCommand(MetaCommand),
+    /// End of file marker
     EndOfFile,
+    /// Integer literal value
     Int(i64),
+    /// String literal value
     String(String),
+    /// Semicolon statement terminator
     Semicolon,
 }
 
+/// Represents a location in the source input
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub struct Location {
-    /// file name
+    /// Source file name
     pub file: String,
-    /// Since start of file
+    /// Offset from the start of the file in characters
     pub offset: usize,
-    /// Length of the litteral
+    /// Length of the token in characters
     pub length: usize,
 }

@@ -37,12 +68,21 @@ impl Default for Location {
 }
 
 impl Location {
+    /// Creates a new Location with the given file, offset, and length.
+    ///
+    /// # Examples
+    ///
     /// ```
     /// use osdb::tokens::Location;
-    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
-    /// assert_eq!(location.file, "src/statement.sql");
-    /// assert_eq!(location.offset, 0);
-    /// assert_eq!(location.length, 10);
+    ///
+    /// // Create a location for a token spanning characters 5-15 in a file
+    /// let loc = Location::new("query.sql".into(), 5, 10);
+    /// assert_eq!(loc.offset, 5);
+    /// assert_eq!(loc.length, 10);
+    ///
+    /// // Zero-length location for EOF marker
+    /// let eof = Location::new("".into(), 20, 0);
+    /// assert_eq!(eof.length, 0);
     /// ```
     pub fn new(file: String, offset: usize, length: usize) -> Self {
         Self {
@@ -53,17 +93,56 @@ impl Location {
     }
 }
 
+/// Represents a token in the input source
 #[derive(Debug, Eq, PartialEq, Clone)]
 pub struct Token {
-    /// Where in the input was this token found?
+    /// Location of the token in the source input
     pub location: Location,
-    /// What is in it?
+    /// The parsed token data
    pub data: TokenData,
-    /// What did it look like while being parsed?
+    /// Original text representation in the source
     pub lexeme: String,
 }
 
+/// Displaying a `Token` shows its semantic meaning followed by the original lexeme.
+///
+/// # Examples
+///
+/// ```
+/// use osdb::tokens::{Token, TokenData, Location};
+/// use osdb::meta_commands::MetaCommand;
+///
+/// let token = Token {
+///     location: Location::new("input.sql".into(), 0, 6),
+///     data: TokenData::Select,
+///     lexeme: "SELECT".to_string(),
+/// };
+///
+/// assert_eq!(format!("{token}"), "select statement \"SELECT\"");
+/// ```
 impl std::fmt::Display for Token {
+    /// Formats the token for display, showing both the semantic meaning
+    /// and original lexeme.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use osdb::tokens::{Token, TokenData, Location};
+    /// # let location = Location::default();
+    /// let int_token = Token {
+    ///     location: location.clone(),
+    ///     data: TokenData::Int(42),
+    ///     lexeme: "42".to_string(),
+    /// };
+    /// assert_eq!(format!("{int_token}"), "integer 42 \"42\"");
+    ///
+    /// let string_token = Token {
+    ///     location,
+    ///     data: TokenData::String("hello".into()),
+    ///     lexeme: "\"hello\"".to_string(),
+    /// };
+    /// assert_eq!(format!("{string_token}"), "string \"hello\" \"\\\"hello\\\"\"");
+    /// ```
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         match &self.data {
             TokenData::Insert => write!(f, "insert statement"),
@@ -86,13 +165,20 @@ struct Tokenizer {
     offset: usize,
 }
 
+/// Represents different kinds of errors that can occur during tokenization
 #[derive(Debug, Eq, PartialEq)]
 pub enum ScanErrorKind {
+    /// Encountered an unexpected character
     UnexpectedChar(char),
+    /// Reached the end of input unexpectedly
     UnexpectedEndOfInput,
+    /// Encountered an unknown keyword
     UnknownKeyword(String),
+    /// Encountered an unknown meta command
     UnknownMetaCommand(String),
+    /// Failed to parse an integer value
     ParseIntError(std::num::ParseIntError),
+    /// Reached the end of input while looking for a matching character
     UnexpectedEndOfInputWhileLookingForMatching(char, Location),
 }
 
@@ -112,9 +198,12 @@ impl std::fmt::Display for ScanErrorKind {
     }
 }
 
+/// Error that occurred during tokenization, with location information
 #[derive(Debug, Eq, PartialEq)]
 pub struct ScanError {
+    /// Location where the error occurred
     pub location: Location,
+    /// Type of scanning error
     pub kind: ScanErrorKind,
 }
 
@@ -362,6 +451,45 @@ impl Tokenizer {
     }
 }
 
+/// Converts a string input into a sequence of tokens
+///
+/// Takes the input to tokenize and a filename for error reporting.
+/// Returns either a vector of tokens or a vector of scanning errors.
+///
+/// # Examples
+///
+/// ## Valid inputs
+/// ```
+/// # use osdb::tokens::tokenize;
+/// // Basic SELECT statement
+/// let tokens = tokenize("SELECT;".into(), "".into()).unwrap();
+/// assert_eq!(tokens.len(), 3); // SELECT, semicolon, EOF
+///
+/// // INSERT with values
+/// let insert = tokenize(r#"INSERT 42 "user" "email""#.into(), "".into()).unwrap();
+/// assert_eq!(insert.len(), 5); // INSERT, int, string, string, EOF
+///
+/// // Meta-commands
+/// let meta = tokenize(".exit .about".into(), "".into()).unwrap();
+/// assert_eq!(meta.len(), 3); // Two meta commands and EOF
+/// ```
+///
+/// ## Invalid inputs
+/// ```
+/// # use osdb::tokens::tokenize;
+/// // Each of these returns Err; the exact errors are pinned by snapshot tests.
+/// // Unclosed string
+/// let err = tokenize(r#"INSERT 1 "unclosed"#.into(), "".into());
+/// assert!(err.is_err());
+///
+/// // Integer too large for i64
+/// let err = tokenize("INSERT 9223372036854775808".into(), "".into());
+/// assert!(err.is_err());
+///
+/// // Unknown meta-command
+/// let err = tokenize(".invalid".into(), "".into());
+/// assert!(err.is_err());
+/// ```
 pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
     let mut tokenizer = Tokenizer::new(input, file);
     let mut errors = Vec::new();
@@ -448,4 +576,32 @@ mod tests {
         assert_debug_snapshot!(tokenize("-".to_string(), "src/ints.sql".to_string(),));
         assert_debug_snapshot!(tokenize("+".to_string(), "src/ints.sql".to_string(),));
     }
+
+    #[test]
+    fn test_string_errors() {
+        assert_debug_snapshot!(tokenize(
+            r#"INSERT "unclosed string"#.to_string(),
+            "".to_string()
+        ));
+        assert_debug_snapshot!(tokenize(
+            r#"SELECT "valid"; "invalid"#.to_string(),
+            "".to_string()
+        ));
+    }
+
+    #[test]
+    fn test_mixed_input() {
+        assert_debug_snapshot!(tokenize(
+            r#".exit INSERT 42 "user" "email"; SELECT"#.to_string(),
+            "".to_string()
+        ));
+    }
+
+    #[test]
+    fn test_whitespace_handling() {
+        assert_debug_snapshot!(tokenize(
+            "  INSERT   \t\n42\n\t;".to_string(),
+            "".to_string()
+        ));
+    }
 }
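-- 

A usage sketch for reviewers, not part of the patch: one way a caller might
drive the `tokenize` API documented above. This assumes the `osdb` crate
paths used in the doc examples; the `report` helper itself is hypothetical.

```
use osdb::tokens::{tokenize, TokenData};

// Hypothetical helper: scan one line of input and print tokens or scan errors.
fn report(input: &str) {
    match tokenize(input.to_string(), "<repl>".to_string()) {
        Ok(tokens) => {
            for token in &tokens {
                // Skip the zero-length EndOfFile marker.
                if token.data != TokenData::EndOfFile {
                    // Token implements Display: semantic meaning plus lexeme.
                    println!("{token} at offset {}", token.location.offset);
                }
            }
        }
        Err(errors) => {
            // Each ScanError carries a Location and a ScanErrorKind,
            // and ScanErrorKind implements Display.
            for e in &errors {
                eprintln!("error at offset {}: {}", e.location.offset, e.kind);
            }
        }
    }
}

fn main() {
    report(r#"INSERT 42 "user" "email";"#);
    report(r#"SELECT "unclosed"#); // exercises the error path
}
```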