refactor: split tokens into data, error, location modules
parent 47a68e5069
commit 5d9f791353

6 changed files with 416 additions and 485 deletions
@@ -94,7 +94,7 @@ impl OSDBError for ScanError {
         let report =
             Report::build(ReportKind::Error, location.clone()).with_message(format!("{self}"));
         let report = match &self.kind {
-            crate::tokens::ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(
+            crate::tokens::error::ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(
                 c,
                 start_location,
             ) => {

496  src/tokens.rs
@@ -1,455 +1,16 @@
-use crate::meta_commands::MetaCommand;
+//! Tokenization infrastructure for the database engine
+//!
+//! This module handles lexical analysis of input strings, converting them
+//! into structured tokens with location information for error reporting.
 
-/// Represents the core lexical elements of the SQL-like language.
-///
-/// Tokens are produced by the tokenizer and consumed by the parser to build
-/// abstract syntax trees. Each variant represents a distinct syntactic element
-/// with associated data when applicable.
-///
-/// # Examples
-///
-/// ```
-/// use osdb::tokens::TokenData;
-/// use osdb::meta_commands::MetaCommand;
-///
-/// // Keyword tokens
-/// let insert = TokenData::Insert;
-/// let select = TokenData::Select;
-///
-/// // Meta command with parameter
-/// let exit_cmd = TokenData::MetaCommand(MetaCommand::Exit);
-///
-/// // Literal values
-/// let number = TokenData::Int(42);
-/// let text = TokenData::String("hello".to_string());
-/// ```
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub enum TokenData {
-    /// INSERT statement keyword
-    Insert,
-    /// SELECT statement keyword
-    Select,
-    /// Meta command (commands starting with '.')
-    MetaCommand(MetaCommand),
-    /// End of file marker
-    EndOfFile,
-    /// Integer literal value
-    Int(i64),
-    /// String literal value
-    String(String),
-    /// Semicolon statement terminator
-    Semicolon,
-}
+pub mod data;
+pub mod error;
+pub mod location;
+mod tokenizer;
-
-/// Represents a location in the source input
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub struct Location {
-    /// Source file name
-    pub file: String,
-    /// Offset from the start of the file in characters
-    pub offset: usize,
-    /// Length of the token in characters
-    pub length: usize,
-}
-
-impl From<&Location> for std::ops::Range<usize> {
-    fn from(val: &Location) -> Self {
-        std::ops::Range {
-            start: val.offset,
-            end: val.offset + val.length,
-        }
-    }
-}
-
-impl Default for Location {
-    fn default() -> Self {
-        Self::new(String::from("<unknown>"), 0, 0)
-    }
-}
-
-impl Location {
-    /// Creates a new Location with the given file, offset and length.
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// use osdb::tokens::Location;
-    ///
-    /// // Create a location for a token spanning characters 5-15 in a file
-    /// let loc = Location::new("query.sql".into(), 5, 10);
-    /// assert_eq!(loc.offset, 5);
-    /// assert_eq!(loc.length, 10);
-    ///
-    /// // Zero-length location for EOF marker
-    /// let eof = Location::new("<input>".into(), 20, 0);
-    /// assert_eq!(eof.length, 0);
-    /// ```
-    pub fn new(file: String, offset: usize, length: usize) -> Self {
-        Self {
-            file,
-            offset,
-            length,
-        }
-    }
-}
-
-/// Represents a token in the input source
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub struct Token {
-    /// Location of the token in the source input
-    pub location: Location,
-    /// The parsed token data
-    pub data: TokenData,
-    /// Original text representation in the source
-    pub lexeme: String,
-}
-
-/// A lexical token with location information and parsed data.
-///
-/// # Examples
-///
-/// ```
-/// use osdb::tokens::{Token, TokenData, Location};
-/// use osdb::meta_commands::MetaCommand;
-///
-/// let token = Token {
-///     location: Location::new("input.sql".into(), 0, 6),
-///     data: TokenData::Select,
-///     lexeme: "SELECT".to_string(),
-/// };
-///
-/// assert_eq!(format!("{token}"), "select statement \"SELECT\"");
-/// ```
-impl std::fmt::Display for Token {
-    /// Formats the token for display, showing both the semantic meaning
-    /// and original lexeme.
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// # use osdb::tokens::{Token, TokenData, Location};
-    /// # let location = Location::default();
-    /// let int_token = Token {
-    ///     location: location.clone(),
-    ///     data: TokenData::Int(42),
-    ///     lexeme: "42".to_string(),
-    /// };
-    /// assert_eq!(format!("{int_token}"), "integer 42 \"42\"");
-    ///
-    /// let string_token = Token {
-    ///     location,
-    ///     data: TokenData::String("hello".into()),
-    ///     lexeme: "\"hello\"".to_string(),
-    /// };
-    /// assert_eq!(format!("{string_token}"), "string \"hello\" \"\\\"hello\\\"\"");
-    /// ```
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match &self.data {
-            TokenData::Insert => write!(f, "insert statement"),
-            TokenData::Select => write!(f, "select statement"),
-            TokenData::MetaCommand(x) => write!(f, "meta-command {x}"),
-            TokenData::EndOfFile => write!(f, "end of file"),
-            TokenData::Int(x) => write!(f, "integer {x}"),
-            TokenData::String(x) => write!(f, "string {x:?}"),
-            TokenData::Semicolon => write!(f, "semicolon"),
-        }?;
-        let lexeme = &self.lexeme;
-        write!(f, " {lexeme:?}")
-    }
-}
-
-struct Tokenizer {
-    input: String,
-    file: String,
-    tokens: Vec<Token>,
-    offset: usize,
-}
-
-/// Represents different kinds of errors that can occur during tokenization
-#[derive(Debug, Eq, PartialEq)]
-pub enum ScanErrorKind {
-    /// Encountered an unexpected character
-    UnexpectedChar(char),
-    /// Reached the end of input unexpectedly
-    UnexpectedEndOfInput,
-    /// Encountered an unknown keyword
-    UnknownKeyword(String),
-    /// Encountered an unknown meta command
-    UnknownMetaCommand(String),
-    /// Failed to parse an integer value
-    ParseIntError(std::num::ParseIntError),
-    /// Reached the end of input while looking for a matching character
-    UnexpectedEndOfInputWhileLookingForMatching(char, Location),
-}
-
-impl std::fmt::Display for ScanErrorKind {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            ScanErrorKind::UnexpectedChar(c) => write!(f, "unexpected char: {c:?}"),
-            ScanErrorKind::UnexpectedEndOfInput => write!(f, "unexpected end of input"),
-            ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"),
-            ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"),
-            ScanErrorKind::ParseIntError(x) => write!(f, "failed to parse integer: {x}"),
-            ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(c, _) => write!(
-                f,
-                "unexpected end of input while looking for matching {c:?}"
-            ),
-        }
-    }
-}
-
-/// Error that occurred during tokenization, with location information
-#[derive(Debug, Eq, PartialEq)]
-pub struct ScanError {
-    /// Location where the error occurred
-    pub location: Location,
-    /// Type of scanning error
-    pub kind: ScanErrorKind,
-}
-
-impl std::fmt::Display for ScanError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let kind = &self.kind;
-        write!(f, "{kind}")
-    }
-}
-
-impl Tokenizer {
-    fn new(input: String, file: String) -> Self {
-        Self {
-            input,
-            file,
-            tokens: Vec::new(),
-            offset: 0,
-        }
-    }
-
-    fn current_location(&self, length: usize) -> Location {
-        Location::new(self.file.clone(), self.offset, length)
-    }
-
-    fn previous_location(&self, length: usize) -> Location {
-        Location::new(self.file.clone(), self.offset - 1, length)
-    }
-
-    fn is_at_end(&self) -> bool {
-        self.offset >= self.input.len()
-    }
-
-    fn peek(&self) -> Option<char> {
-        self.input.chars().nth(self.offset)
-    }
-
-    fn advance(&mut self) -> Option<char> {
-        let c = self.input.chars().nth(self.offset);
-        self.offset += 1;
-        c
-    }
-
-    fn recognize_keyword(word: &str) -> Option<TokenData> {
-        match word.to_lowercase().as_str() {
-            "insert" => Some(TokenData::Insert),
-            "select" => Some(TokenData::Select),
-            _ => None,
-        }
-    }
-
-    fn recognize_metacommand(word: &str) -> Option<TokenData> {
-        match word.to_lowercase().as_str() {
-            ".exit" => Some(TokenData::MetaCommand(MetaCommand::Exit)),
-            ".about" => Some(TokenData::MetaCommand(MetaCommand::About)),
-            ".version" => Some(TokenData::MetaCommand(MetaCommand::Version)),
-            _ => None,
-        }
-    }
-
-    fn scan_meta_command(&mut self) -> Result<Token, ScanError> {
-        let start_offset = self.offset;
-        let mut word = String::new();
-        let mut length = 0;
-        if let Some(c) = self.advance() {
-            word.push(c);
-            length += 1;
-        }
-        while let Some(c) = self.peek() {
-            if c.is_alphabetic() || c == '_' {
-                word.push(c);
-                self.advance();
-            } else {
-                break;
-            }
-            length += 1;
-        }
-        if let Some(meta) = Self::recognize_metacommand(&word) {
-            Ok(Token {
-                location: Location::new(self.file.clone(), start_offset, length),
-                data: meta,
-                lexeme: word,
-            })
-        } else {
-            Err(ScanError {
-                location: Location::new(self.file.clone(), start_offset, length),
-                kind: ScanErrorKind::UnknownMetaCommand(word),
-            })
-        }
-    }
-
-    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
-        let start_offset = self.offset;
-        let mut word = String::new();
-        let mut length = 0;
-        if let Some(c) = self.advance() {
-            word.push(c);
-            length += 1;
-        }
-        while let Some(c) = self.peek() {
-            if Self::ident_or_keyword_inner(c) {
-                word.push(c);
-                self.advance();
-            } else {
-                break;
-            }
-            length += 1;
-        }
-        if let Some(keyword) = Self::recognize_keyword(&word) {
-            Ok(Token {
-                location: Location::new(self.file.clone(), start_offset, length),
-                data: keyword,
-                lexeme: word,
-            })
-        } else {
-            Err(ScanError {
-                location: Location::new(self.file.clone(), start_offset, length),
-                kind: ScanErrorKind::UnknownKeyword(word),
-            })
-        }
-    }
-
-    fn ident_or_keyword_start(c: char) -> bool {
-        c.is_alphabetic() || c == '_'
-    }
-
-    fn ident_or_keyword_inner(c: char) -> bool {
-        c.is_alphanumeric() || c == '_'
-    }
-
-    fn digit(c: char) -> bool {
-        c.is_ascii_digit() || c == '-' || c == '+'
-    }
-
-    fn scan_integer(&mut self) -> Result<Token, ScanError> {
-        let start_offset = self.offset;
-        let mut word = String::new();
-        let mut length = 0;
-        if let Some(c) = self.advance() {
-            word.push(c);
-            length += 1;
-        }
-        while let Some(c) = self.peek() {
-            if Self::digit(c) {
-                word.push(c);
-                self.advance();
-            } else {
-                break;
-            }
-            length += 1;
-        }
-        match word.parse::<i64>() {
-            Ok(int) => Ok(Token {
-                location: Location::new(self.file.clone(), start_offset, length),
-                data: TokenData::Int(int),
-                lexeme: word,
-            }),
-            Err(e) => Err(ScanError {
-                location: Location::new(self.file.clone(), start_offset, length),
-                kind: ScanErrorKind::ParseIntError(e),
-            }),
-        }
-    }
-
-    fn scan_string(&mut self) -> Result<Token, ScanError> {
-        let start_offset = self.offset;
-        let mut word = String::new();
-        let mut lexeme = String::new();
-        let mut length = 0;
-        let mut valid = false;
-        if let Some(c) = self.advance() {
-            lexeme.push(c);
-            length += 1;
-        }
-        while let Some(c) = self.advance() {
-            lexeme.push(c);
-            length += 1;
-            if c == '"' {
-                valid = true;
-                break;
-            } else {
-                word.push(c);
-            }
-        }
-        if valid {
-            Ok(Token {
-                location: Location::new(self.file.clone(), start_offset, length),
-                data: TokenData::String(word),
-                lexeme,
-            })
-        } else {
-            Err(ScanError {
-                location: self.previous_location(0),
-                kind: ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(
-                    '"',
-                    Location::new(self.file.clone(), start_offset, 1),
-                ),
-            })
-        }
-    }
-
-    fn scan_semicolon(&mut self) -> Result<Token, ScanError> {
-        self.advance();
-        Ok(Token {
-            location: self.previous_location(1),
-            data: TokenData::Semicolon,
-            lexeme: String::from(";"),
-        })
-    }
-
-    fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
-        loop {
-            if let Some(c) = self.peek() {
-                if Self::ident_or_keyword_start(c) {
-                    return self.scan_identifier_or_keyword().map(Some);
-                } else if c == '.' {
-                    return self.scan_meta_command().map(Some);
-                } else if Self::digit(c) {
-                    return self.scan_integer().map(Some);
-                } else if c == '"' {
-                    return self.scan_string().map(Some);
-                } else if c == ';' {
-                    return self.scan_semicolon().map(Some);
-                } else if c.is_whitespace() {
-                    self.advance();
-                } else {
-                    let result = Err(ScanError {
-                        location: self.current_location(1),
-                        kind: ScanErrorKind::UnexpectedChar(c),
-                    });
-                    self.advance();
-                    return result;
-                }
-            } else {
-                return Ok(None);
-            }
-        }
-    }
-
-    fn finalize(&mut self) {
-        self.tokens.push(Token {
-            location: self.current_location(0),
-            data: TokenData::EndOfFile,
-            lexeme: String::new(),
-        });
-    }
-}
+pub use data::{Token, TokenData};
+pub use error::ScanError;
+pub use location::Location;
 
 /// Converts a string input into a sequence of tokens
 ///
@@ -491,7 +52,7 @@ impl Tokenizer {
 /// assert_debug_snapshot!("invalid meta-command", err);
 /// ```
 pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
-    let mut tokenizer = Tokenizer::new(input, file);
+    let mut tokenizer = tokenizer::Tokenizer::new(input, file);
     let mut errors = Vec::new();
     while !tokenizer.is_at_end() {
         let token = tokenizer.scan_token();
@@ -524,39 +85,6 @@ mod tests {
         assert_debug_snapshot!(tokenize(".halp".to_string(), "<stdin>".to_string()));
     }
 
-    #[test]
-    fn test_tokenizer() {
-        let mut scanresult =
-            tokenize("INSERT Select".to_string(), "src/statement.sql".to_string()).unwrap();
-        scanresult.reverse();
-        assert_eq!(
-            scanresult.pop(),
-            Some(Token {
-                location: Location::new(String::from("src/statement.sql"), 0, 6),
-                data: TokenData::Insert,
-                lexeme: String::from("INSERT"),
-            })
-        );
-        assert_eq!(
-            scanresult.pop(),
-            Some(Token {
-                location: Location::new(String::from("src/statement.sql"), 7, 6),
-                data: TokenData::Select,
-                lexeme: String::from("Select"),
-            })
-        );
-        assert_eq!(
-            scanresult.pop(),
-            Some(Token {
-                location: Location::new(String::from("src/statement.sql"), 13, 0),
-                data: TokenData::EndOfFile,
-                lexeme: String::from(""),
-            })
-        );
-        assert_eq!(scanresult.pop(), None);
-        assert!(scanresult.is_empty());
-    }
-
     #[test]
     fn test_tokenizer_errors() {
         assert_debug_snapshot!(tokenize(
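
The `pub use` re-exports in the hunk above keep the module's public surface stable across the split. A minimal sketch of an unchanged call site, assuming the crate is named `osdb` as in the doc examples:

```rust
// `Token`, `TokenData` and `Location` moved into submodules, but the
// re-exports keep their old `osdb::tokens::*` paths working unchanged.
use osdb::tokens::{tokenize, Location, Token, TokenData};

fn main() {
    let tokens: Vec<Token> = tokenize("SELECT;".to_string(), "<stdin>".to_string()).unwrap();
    assert_eq!(tokens[0].data, TokenData::Select);
    assert_eq!(tokens[0].location, Location::new("<stdin>".into(), 0, 6));
    assert_eq!(tokens[1].data, TokenData::Semicolon);
    assert_eq!(tokens[2].data, TokenData::EndOfFile);
}
```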

36  src/tokens/data.rs  (new file)
@@ -0,0 +1,36 @@
+use crate::meta_commands::MetaCommand;
+
+/// Represents the core lexical elements of the SQL-like language
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub enum TokenData {
+    Insert,
+    Select,
+    MetaCommand(MetaCommand),
+    EndOfFile,
+    Int(i64),
+    String(String),
+    Semicolon,
+}
+
+/// Represents a token in the input source
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct Token {
+    pub location: super::Location,
+    pub data: TokenData,
+    pub lexeme: String,
+}
+
+impl std::fmt::Display for Token {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match &self.data {
+            TokenData::Insert => write!(f, "insert statement"),
+            TokenData::Select => write!(f, "select statement"),
+            TokenData::MetaCommand(x) => write!(f, "meta-command {x}"),
+            TokenData::EndOfFile => write!(f, "end of file"),
+            TokenData::Int(x) => write!(f, "integer {x}"),
+            TokenData::String(x) => write!(f, "string {x:?}"),
+            TokenData::Semicolon => write!(f, "semicolon"),
+        }?;
+        write!(f, " {:?}", &self.lexeme)
+    }
+}
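
The `Display` impl keeps the old output shape: semantic meaning first, then the quoted lexeme. A minimal sketch, again assuming the `osdb` crate name from the original doc examples:

```rust
use osdb::tokens::{Location, Token, TokenData};

fn main() {
    let token = Token {
        location: Location::default(), // "<unknown>", offset 0, length 0
        data: TokenData::Int(42),
        lexeme: "42".to_string(),
    };
    // Semantic meaning, then the original lexeme via `{:?}`.
    assert_eq!(format!("{token}"), "integer 42 \"42\"");
}
```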

39  src/tokens/error.rs  (new file)
@@ -0,0 +1,39 @@
+use super::Location;
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum ScanErrorKind {
+    UnexpectedChar(char),
+    UnexpectedEndOfInput,
+    UnknownKeyword(String),
+    UnknownMetaCommand(String),
+    ParseIntError(std::num::ParseIntError),
+    UnexpectedEndOfInputWhileLookingForMatching(char, Location),
+}
+
+impl std::fmt::Display for ScanErrorKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            ScanErrorKind::UnexpectedChar(c) => write!(f, "unexpected char: {c:?}"),
+            ScanErrorKind::UnexpectedEndOfInput => write!(f, "unexpected end of input"),
+            ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"),
+            ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"),
+            ScanErrorKind::ParseIntError(x) => write!(f, "failed to parse integer: {x}"),
+            ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(c, _) => write!(
+                f,
+                "unexpected end of input while looking for matching {c:?}"
+            ),
+        }
+    }
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub struct ScanError {
+    pub location: Location,
+    pub kind: ScanErrorKind,
+}
+
+impl std::fmt::Display for ScanError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", &self.kind)
+    }
+}
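
Only `ScanError` is re-exported; `ScanErrorKind` stays reachable through the now-public `error` module, which is exactly the path change the first hunk makes. A sketch of the error Display behavior, assuming the `osdb` crate name:

```rust
use osdb::tokens::error::ScanErrorKind;
use osdb::tokens::{Location, ScanError};

fn main() {
    let err = ScanError {
        location: Location::new("<stdin>".into(), 3, 1),
        kind: ScanErrorKind::UnexpectedChar('%'),
    };
    // `ScanError`'s Display defers entirely to its kind.
    assert_eq!(format!("{err}"), "unexpected char: '%'");
}
```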

32  src/tokens/location.rs  (new file)
@@ -0,0 +1,32 @@
+/// Represents a location in the source input
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct Location {
+    pub file: String,
+    pub offset: usize,
+    pub length: usize,
+}
+
+impl From<&Location> for std::ops::Range<usize> {
+    fn from(val: &Location) -> Self {
+        std::ops::Range {
+            start: val.offset,
+            end: val.offset + val.length,
+        }
+    }
+}
+
+impl Default for Location {
+    fn default() -> Self {
+        Self::new(String::from("<unknown>"), 0, 0)
+    }
+}
+
+impl Location {
+    pub fn new(file: String, offset: usize, length: usize) -> Self {
+        Self {
+            file,
+            offset,
+            length,
+        }
+    }
+}
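
The `From<&Location>` conversion is what lets an error reporter treat a location as a half-open character span. A minimal sketch (crate name `osdb` assumed):

```rust
use osdb::tokens::Location;

fn main() {
    let loc = Location::new("query.sql".into(), 5, 10);
    // offset..offset + length, i.e. a half-open span of 10 characters.
    let span: std::ops::Range<usize> = (&loc).into();
    assert_eq!(span, 5..15);
}
```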

296  src/tokens/tokenizer.rs  (new file)
@@ -0,0 +1,296 @@
+use super::data::{Token, TokenData};
+use super::error::{ScanError, ScanErrorKind};
+use super::location::Location;
+use crate::meta_commands::MetaCommand;
+
+pub(super) struct Tokenizer {
+    input: String,
+    file: String,
+    pub(super) tokens: Vec<Token>,
+    offset: usize,
+}
+
+impl Tokenizer {
+    pub(super) fn new(input: String, file: String) -> Self {
+        Self {
+            input,
+            file,
+            tokens: Vec::new(),
+            offset: 0,
+        }
+    }
+
+    fn current_location(&self, length: usize) -> Location {
+        Location::new(self.file.clone(), self.offset, length)
+    }
+
+    fn previous_location(&self, length: usize) -> Location {
+        Location::new(self.file.clone(), self.offset - 1, length)
+    }
+
+    pub(super) fn is_at_end(&self) -> bool {
+        self.offset >= self.input.len()
+    }
+
+    fn peek(&self) -> Option<char> {
+        self.input.chars().nth(self.offset)
+    }
+
+    fn advance(&mut self) -> Option<char> {
+        let c = self.input.chars().nth(self.offset);
+        self.offset += 1;
+        c
+    }
+
+    fn recognize_keyword(word: &str) -> Option<TokenData> {
+        match word.to_lowercase().as_str() {
+            "insert" => Some(TokenData::Insert),
+            "select" => Some(TokenData::Select),
+            _ => None,
+        }
+    }
+
+    fn recognize_metacommand(word: &str) -> Option<TokenData> {
+        match word.to_lowercase().as_str() {
+            ".exit" => Some(TokenData::MetaCommand(MetaCommand::Exit)),
+            ".about" => Some(TokenData::MetaCommand(MetaCommand::About)),
+            ".version" => Some(TokenData::MetaCommand(MetaCommand::Version)),
+            _ => None,
+        }
+    }
+
+    fn scan_meta_command(&mut self) -> Result<Token, ScanError> {
+        let start_offset = self.offset;
+        let mut word = String::new();
+        let mut length = 0;
+        if let Some(c) = self.advance() {
+            word.push(c);
+            length += 1;
+        }
+        while let Some(c) = self.peek() {
+            if c.is_alphabetic() || c == '_' {
+                word.push(c);
+                self.advance();
+            } else {
+                break;
+            }
+            length += 1;
+        }
+        if let Some(meta) = Self::recognize_metacommand(&word) {
+            Ok(Token {
+                location: Location::new(self.file.clone(), start_offset, length),
+                data: meta,
+                lexeme: word,
+            })
+        } else {
+            Err(ScanError {
+                location: Location::new(self.file.clone(), start_offset, length),
+                kind: ScanErrorKind::UnknownMetaCommand(word),
+            })
+        }
+    }
+
+    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
+        let start_offset = self.offset;
+        let mut word = String::new();
+        let mut length = 0;
+        if let Some(c) = self.advance() {
+            word.push(c);
+            length += 1;
+        }
+        while let Some(c) = self.peek() {
+            if Self::ident_or_keyword_inner(c) {
+                word.push(c);
+                self.advance();
+            } else {
+                break;
+            }
+            length += 1;
+        }
+        if let Some(keyword) = Self::recognize_keyword(&word) {
+            Ok(Token {
+                location: Location::new(self.file.clone(), start_offset, length),
+                data: keyword,
+                lexeme: word,
+            })
+        } else {
+            Err(ScanError {
+                location: Location::new(self.file.clone(), start_offset, length),
+                kind: ScanErrorKind::UnknownKeyword(word),
+            })
+        }
+    }
+
+    fn ident_or_keyword_start(c: char) -> bool {
+        c.is_alphabetic() || c == '_'
+    }
+
+    fn ident_or_keyword_inner(c: char) -> bool {
+        c.is_alphanumeric() || c == '_'
+    }
+
+    fn digit(c: char) -> bool {
+        c.is_ascii_digit() || c == '-' || c == '+'
+    }
+
+    fn scan_integer(&mut self) -> Result<Token, ScanError> {
+        let start_offset = self.offset;
+        let mut word = String::new();
+        let mut length = 0;
+        if let Some(c) = self.advance() {
+            word.push(c);
+            length += 1;
+        }
+        while let Some(c) = self.peek() {
+            if Self::digit(c) {
+                word.push(c);
+                self.advance();
+            } else {
+                break;
+            }
+            length += 1;
+        }
+        match word.parse::<i64>() {
+            Ok(int) => Ok(Token {
+                location: Location::new(self.file.clone(), start_offset, length),
+                data: TokenData::Int(int),
+                lexeme: word,
+            }),
+            Err(e) => Err(ScanError {
+                location: Location::new(self.file.clone(), start_offset, length),
+                kind: ScanErrorKind::ParseIntError(e),
+            }),
+        }
+    }
+
+    fn scan_string(&mut self) -> Result<Token, ScanError> {
+        let start_offset = self.offset;
+        let mut word = String::new();
+        let mut lexeme = String::new();
+        let mut length = 0;
+        let mut valid = false;
+        if let Some(c) = self.advance() {
+            lexeme.push(c);
+            length += 1;
+        }
+        while let Some(c) = self.advance() {
+            lexeme.push(c);
+            length += 1;
+            if c == '"' {
+                valid = true;
+                break;
+            } else {
+                word.push(c);
+            }
+        }
+        if valid {
+            Ok(Token {
+                location: Location::new(self.file.clone(), start_offset, length),
+                data: TokenData::String(word),
+                lexeme,
+            })
+        } else {
+            Err(ScanError {
+                location: self.previous_location(0),
+                kind: ScanErrorKind::UnexpectedEndOfInputWhileLookingForMatching(
+                    '"',
+                    Location::new(self.file.clone(), start_offset, 1),
+                ),
+            })
+        }
+    }
+
+    fn scan_semicolon(&mut self) -> Result<Token, ScanError> {
+        self.advance();
+        Ok(Token {
+            location: self.previous_location(1),
+            data: TokenData::Semicolon,
+            lexeme: String::from(";"),
+        })
+    }
+
+    pub(super) fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
+        loop {
+            if let Some(c) = self.peek() {
+                if Self::ident_or_keyword_start(c) {
+                    return self.scan_identifier_or_keyword().map(Some);
+                } else if c == '.' {
+                    return self.scan_meta_command().map(Some);
+                } else if Self::digit(c) {
+                    return self.scan_integer().map(Some);
+                } else if c == '"' {
+                    return self.scan_string().map(Some);
+                } else if c == ';' {
+                    return self.scan_semicolon().map(Some);
+                } else if c.is_whitespace() {
+                    self.advance();
+                } else {
+                    let result = Err(ScanError {
+                        location: self.current_location(1),
+                        kind: ScanErrorKind::UnexpectedChar(c),
+                    });
+                    self.advance();
+                    return result;
+                }
+            } else {
+                return Ok(None);
+            }
+        }
+    }
+
+    pub(super) fn finalize(&mut self) {
+        self.tokens.push(Token {
+            location: self.current_location(0),
+            data: TokenData::EndOfFile,
+            lexeme: String::new(),
+        });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tokens::{Token, TokenData};
+
+    #[test]
+    fn test_tokenizer() {
+        let mut tokenizer = Tokenizer::new("INSERT Select".into(), "src/statement.sql".into());
+        let mut tokens = Vec::new();
+
+        while !tokenizer.is_at_end() {
+            if let Ok(Some(token)) = tokenizer.scan_token() {
+                tokens.push(token);
+            }
+        }
+        tokenizer.finalize();
+        tokens.extend(tokenizer.tokens);
+
+        let mut scanresult = tokens;
+        scanresult.reverse();
+        assert_eq!(
+            scanresult.pop(),
+            Some(Token {
+                location: Location::new(String::from("src/statement.sql"), 0, 6),
+                data: TokenData::Insert,
+                lexeme: String::from("INSERT"),
+            })
+        );
+        assert_eq!(
+            scanresult.pop(),
+            Some(Token {
+                location: Location::new(String::from("src/statement.sql"), 7, 6),
+                data: TokenData::Select,
+                lexeme: String::from("Select"),
+            })
+        );
+        assert_eq!(
+            scanresult.pop(),
+            Some(Token {
+                location: Location::new(String::from("src/statement.sql"), 13, 0),
+                data: TokenData::EndOfFile,
+                lexeme: String::from(""),
+            })
+        );
+        assert_eq!(scanresult.pop(), None);
+        assert!(scanresult.is_empty());
+    }
+}
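
`Tokenizer` itself is `pub(super)`, so outside the crate the only entry point is `tokenize`, which drives `scan_token` to exhaustion and collects errors along the way. A sketch of the error path, assuming the `osdb` crate name from the doc examples:

```rust
use osdb::tokens::tokenize;

fn main() {
    // An unknown meta-command yields Err(Vec<ScanError>) rather than tokens.
    let errs = tokenize(".halp".to_string(), "<stdin>".to_string()).unwrap_err();
    assert_eq!(format!("{}", errs[0]), "unknown meta-command: \".halp\"");
}
```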