docs: add code comments and test cases for tokenization

commit 47a68e5069 (parent 33c4edf91d)
6 changed files with 347 additions and 10 deletions

.gitignore (vendored, 1 addition)

@@ -1,3 +1,4 @@
/.direnv
/target
/result
.aider*

src/snapshots/osdb__tokens__tests__mixed_input.snap (new file, 88 additions)

@@ -0,0 +1,88 @@
---
source: src/tokens.rs
expression: "tokenize(r#\".exit INSERT 42 \"user\" \"email\"; SELECT\"#.to_string(),\n\"<mixed>\".to_string())"
---
Ok(
    [
        Token {
            location: Location {
                file: "<mixed>",
                offset: 0,
                length: 5,
            },
            data: MetaCommand(
                Exit,
            ),
            lexeme: ".exit",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 6,
                length: 6,
            },
            data: Insert,
            lexeme: "INSERT",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 13,
                length: 2,
            },
            data: Int(
                42,
            ),
            lexeme: "42",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 16,
                length: 6,
            },
            data: String(
                "user",
            ),
            lexeme: "\"user\"",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 23,
                length: 7,
            },
            data: String(
                "email",
            ),
            lexeme: "\"email\"",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 30,
                length: 1,
            },
            data: Semicolon,
            lexeme: ";",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 32,
                length: 6,
            },
            data: Select,
            lexeme: "SELECT",
        },
        Token {
            location: Location {
                file: "<mixed>",
                offset: 38,
                length: 0,
            },
            data: EndOfFile,
            lexeme: "",
        },
    ],
)
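
Note (not part of the commit): the offset and length fields in this snapshot index characters in the tokenized input, and the input here is ASCII, so byte and character offsets coincide. A hypothetical spot check:

```rust
// Not part of the commit: verify a few offsets from the snapshot above.
fn main() {
    let input = r#".exit INSERT 42 "user" "email"; SELECT"#;
    assert_eq!(&input[0..5], ".exit");       // MetaCommand(Exit): offset 0, length 5
    assert_eq!(&input[6..12], "INSERT");     // Insert: offset 6, length 6
    assert_eq!(&input[16..22], r#""user""#); // String("user"): offset 16, length 6
    assert_eq!(&input[38..], "");            // EndOfFile: offset 38, length 0
}
```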

src/snapshots/osdb__tokens__tests__string_errors-2.snap (new file, 23 additions)

@@ -0,0 +1,23 @@
---
source: src/tokens.rs
expression: "tokenize(r#\"SELECT \"valid\"; \"invalid\"#.to_string(), \"<input>\".to_string())"
---
Err(
    [
        ScanError {
            location: Location {
                file: "<input>",
                offset: 24,
                length: 0,
            },
            kind: UnexpectedEndOfInputWhileLookingForMatching(
                '"',
                Location {
                    file: "<input>",
                    offset: 16,
                    length: 1,
                },
            ),
        },
    ],
)

src/snapshots/osdb__tokens__tests__string_errors.snap (new file, 23 additions)

@@ -0,0 +1,23 @@
---
source: src/tokens.rs
expression: "tokenize(r#\"INSERT \"unclosed string\"#.to_string(), \"<input>\".to_string())"
---
Err(
    [
        ScanError {
            location: Location {
                file: "<input>",
                offset: 23,
                length: 0,
            },
            kind: UnexpectedEndOfInputWhileLookingForMatching(
                '"',
                Location {
                    file: "<input>",
                    offset: 7,
                    length: 1,
                },
            ),
        },
    ],
)
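
Note (not part of the commit): in both error snapshots the outer location (length 0) marks where scanning hit the end of input, while the Location nested inside UnexpectedEndOfInputWhileLookingForMatching points back at the unmatched opening quote. A hypothetical spot check:

```rust
// Not part of the commit: relate the two locations to the input string.
fn main() {
    let input = r#"INSERT "unclosed string"#;
    assert_eq!(input.len(), 23);    // outer location: offset 23, length 0 (end of input)
    assert_eq!(&input[7..8], "\""); // nested location: offset 7, length 1 (opening quote)
}
```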

src/snapshots/osdb__tokens__tests__whitespace_handling.snap (new file, 46 additions)

@@ -0,0 +1,46 @@
---
source: src/tokens.rs
expression: "tokenize(\" INSERT \\t\\n42\\n\\t;\".to_string(), \"<whitespace>\".to_string())"
---
Ok(
    [
        Token {
            location: Location {
                file: "<whitespace>",
                offset: 2,
                length: 6,
            },
            data: Insert,
            lexeme: "INSERT",
        },
        Token {
            location: Location {
                file: "<whitespace>",
                offset: 13,
                length: 2,
            },
            data: Int(
                42,
            ),
            lexeme: "42",
        },
        Token {
            location: Location {
                file: "<whitespace>",
                offset: 17,
                length: 1,
            },
            data: Semicolon,
            lexeme: ";",
        },
        Token {
            location: Location {
                file: "<whitespace>",
                offset: 18,
                length: 0,
            },
            data: EndOfFile,
            lexeme: "",
        },
    ],
)

src/tokens.rs (176 changed lines)

@@ -1,23 +1,54 @@
use crate::meta_commands::MetaCommand;

/// Represents the core lexical elements of the SQL-like language.
///
/// Tokens are produced by the tokenizer and consumed by the parser to build
/// abstract syntax trees. Each variant represents a distinct syntactic element
/// with associated data when applicable.
///
/// # Examples
///
/// ```
/// use osdb::tokens::TokenData;
/// use osdb::meta_commands::MetaCommand;
///
/// // Keyword tokens
/// let insert = TokenData::Insert;
/// let select = TokenData::Select;
///
/// // Meta command with parameter
/// let exit_cmd = TokenData::MetaCommand(MetaCommand::Exit);
///
/// // Literal values
/// let number = TokenData::Int(42);
/// let text = TokenData::String("hello".to_string());
/// ```
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum TokenData {
    /// INSERT statement keyword
    Insert,
    /// SELECT statement keyword
    Select,
    /// Meta command (commands starting with '.')
    MetaCommand(MetaCommand),
    /// End of file marker
    EndOfFile,
    /// Integer literal value
    Int(i64),
    /// String literal value
    String(String),
    /// Semicolon statement terminator
    Semicolon,
}

/// Represents a location in the source input
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// file name
    /// Source file name
    pub file: String,
    /// Since start of file
    /// Offset from the start of the file in characters
    pub offset: usize,
    /// Length of the litteral
    /// Length of the token in characters
    pub length: usize,
}
@@ -37,12 +68,21 @@ impl Default for Location {
}

impl Location {
    /// Creates a new Location with the given file, offset and length.
    ///
    /// # Examples
    ///
    /// ```
    /// use osdb::tokens::Location;
    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
    /// assert_eq!(location.file, "src/statement.sql");
    /// assert_eq!(location.offset, 0);
    /// assert_eq!(location.length, 10);
    ///
    /// // Create a location for a token spanning bytes 5-15 in a file
    /// let loc = Location::new("query.sql".into(), 5, 10);
    /// assert_eq!(loc.offset, 5);
    /// assert_eq!(loc.length, 10);
    ///
    /// // Zero-length location for EOF marker
    /// let eof = Location::new("<input>".into(), 20, 0);
    /// assert_eq!(eof.length, 0);
    /// ```
    pub fn new(file: String, offset: usize, length: usize) -> Self {
        Self {
@@ -53,17 +93,56 @@ impl Location {
    }
}

/// Represents a token in the input source
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Token {
    /// Where in the input was this token found?
    /// Location of the token in the source input
    pub location: Location,
    /// What is in it?
    /// The parsed token data
    pub data: TokenData,
    /// What did it look like while being parsed?
    /// Original text representation in the source
    pub lexeme: String,
}

/// A lexical token with location information and parsed data.
///
/// # Examples
///
/// ```
/// use osdb::tokens::{Token, TokenData, Location};
/// use osdb::meta_commands::MetaCommand;
///
/// let token = Token {
///     location: Location::new("input.sql".into(), 0, 6),
///     data: TokenData::Select,
///     lexeme: "SELECT".to_string(),
/// };
///
/// assert_eq!(format!("{token}"), "select statement \"SELECT\"");
/// ```
impl std::fmt::Display for Token {
    /// Formats the token for display, showing both the semantic meaning
    /// and original lexeme.
    ///
    /// # Examples
    ///
    /// ```
    /// # use osdb::tokens::{Token, TokenData, Location};
    /// # let location = Location::default();
    /// let int_token = Token {
    ///     location: location.clone(),
    ///     data: TokenData::Int(42),
    ///     lexeme: "42".to_string(),
    /// };
    /// assert_eq!(format!("{int_token}"), "integer 42 \"42\"");
    ///
    /// let string_token = Token {
    ///     location,
    ///     data: TokenData::String("hello".into()),
    ///     lexeme: "\"hello\"".to_string(),
    /// };
    /// assert_eq!(format!("{string_token}"), "string \"hello\" \"\\\"hello\\\"\"");
    /// ```
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match &self.data {
            TokenData::Insert => write!(f, "insert statement"),
@@ -86,13 +165,20 @@ struct Tokenizer {
    offset: usize,
}

/// Represents different kinds of errors that can occur during tokenization
#[derive(Debug, Eq, PartialEq)]
pub enum ScanErrorKind {
    /// Encountered an unexpected character
    UnexpectedChar(char),
    /// Reached the end of input unexpectedly
    UnexpectedEndOfInput,
    /// Encountered an unknown keyword
    UnknownKeyword(String),
    /// Encountered an unknown meta command
    UnknownMetaCommand(String),
    /// Failed to parse an integer value
    ParseIntError(std::num::ParseIntError),
    /// Reached the end of input while looking for a matching character
    UnexpectedEndOfInputWhileLookingForMatching(char, Location),
}
@@ -112,9 +198,12 @@ impl std::fmt::Display for ScanErrorKind {
    }
}

/// Error that occurred during tokenization, with location information
#[derive(Debug, Eq, PartialEq)]
pub struct ScanError {
    /// Location where the error occurred
    pub location: Location,
    /// Type of scanning error
    pub kind: ScanErrorKind,
}
@@ -362,6 +451,45 @@ impl Tokenizer {
    }
}

/// Converts a string input into a sequence of tokens
///
/// Takes the input to tokenize and a filename for error reporting.
/// Returns either a vector of tokens or a vector of scanning errors.
///
/// # Examples
///
/// ## Valid inputs
/// ```
/// # use osdb::tokens::tokenize;
/// // Basic SELECT statement
/// let tokens = tokenize("SELECT;".into(), "<test>".into()).unwrap();
/// assert_eq!(tokens.len(), 3); // SELECT, semicolon, EOF
///
/// // INSERT with values
/// let insert = tokenize(r#"INSERT 42 "user" "email""#.into(), "<test>".into()).unwrap();
/// assert_eq!(insert.len(), 5); // INSERT, int, string, string, EOF
///
/// // Meta-commands
/// let meta = tokenize(".exit .about".into(), "<test>".into()).unwrap();
/// assert_eq!(meta.len(), 3); // Two meta commands and EOF
/// ```
///
/// ## Invalid inputs
/// ```
/// # use osdb::tokens::tokenize;
/// # use insta::assert_debug_snapshot;
/// // Unclosed string
/// let err = tokenize(r#"INSERT 1 "unclosed"#.into(), "<test>".into());
/// assert_debug_snapshot!("unclosed double quote", err);
///
/// // Invalid integer
/// let err = tokenize("INSERT 9223372036854775808".into(), "<test>".into());
/// assert_debug_snapshot!("invalid integer", err);
///
/// // Unknown meta-command
/// let err = tokenize(".invalid".into(), "<test>".into());
/// assert_debug_snapshot!("invalid meta-command", err);
/// ```
pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
    let mut tokenizer = Tokenizer::new(input, file);
    let mut errors = Vec::new();
@@ -448,4 +576,32 @@ mod tests {
        assert_debug_snapshot!(tokenize("-".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("+".to_string(), "src/ints.sql".to_string(),));
    }

    #[test]
    fn test_string_errors() {
        assert_debug_snapshot!(tokenize(
            r#"INSERT "unclosed string"#.to_string(),
            "<input>".to_string()
        ));
        assert_debug_snapshot!(tokenize(
            r#"SELECT "valid"; "invalid"#.to_string(),
            "<input>".to_string()
        ));
    }

    #[test]
    fn test_mixed_input() {
        assert_debug_snapshot!(tokenize(
            r#".exit INSERT 42 "user" "email"; SELECT"#.to_string(),
            "<mixed>".to_string()
        ));
    }

    #[test]
    fn test_whitespace_handling() {
        assert_debug_snapshot!(tokenize(
            " INSERT \t\n42\n\t;".to_string(),
            "<whitespace>".to_string()
        ));
    }
}
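
Taken together, the new doc comments describe the tokenizer's public surface. A minimal sketch (not part of the commit) of how a caller might drive it, assuming only the items shown in this diff (`tokenize`, `Token`'s `Display` impl, and `ScanError`'s public fields); the `report` helper is hypothetical:

```rust
// Hypothetical consumer of the tokenize API documented above.
use osdb::tokens::{tokenize, ScanError};

fn report(errors: &[ScanError], input_name: &str) {
    for err in errors {
        // ScanErrorKind implements Display (see the hunk at -112,9 above),
        // so it can be printed directly.
        eprintln!(
            "{}: error at offset {} (length {}): {}",
            input_name, err.location.offset, err.location.length, err.kind
        );
    }
}

fn main() {
    match tokenize("SELECT;".to_string(), "<repl>".to_string()) {
        Ok(tokens) => {
            for token in tokens {
                println!("{token}"); // Token implements Display
            }
        }
        Err(errors) => report(&errors, "<repl>"),
    }
}
```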