use crate::meta_commands::MetaCommand;

#[derive(Debug, Eq, PartialEq)]
pub enum TokenData {
    Insert,
    Select,
    MetaCommand(MetaCommand),
    EndOfFile,
}

#[derive(Debug, Eq, PartialEq)]
pub struct Location {
    /// File name
    pub file: String,
    /// Offset since the start of the file
    pub offset: usize,
    /// Length of the literal
    pub length: usize,
}

impl From<&Location> for std::ops::Range<usize> {
    fn from(val: &Location) -> Self {
        std::ops::Range {
            start: val.offset,
            end: val.offset + val.length,
        }
    }
}

impl Location {
    /// ```
    /// use osdb::tokens::Location;
    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
    /// assert_eq!(location.file, "src/statement.sql");
    /// assert_eq!(location.offset, 0);
    /// assert_eq!(location.length, 10);
    /// ```
    pub fn new(file: String, offset: usize, length: usize) -> Self {
        Self {
            file,
            offset,
            length,
        }
    }
}

#[derive(Debug, Eq, PartialEq)]
pub struct Token {
    /// Where in the input was this token found?
    pub location: Location,
    /// What is in it?
    pub data: TokenData,
    /// What did it look like while being parsed?
    pub lexeme: String,
}

struct Tokenizer {
    input: String,
    file: String,
    tokens: Vec<Token>,
    offset: usize,
}

#[derive(Debug, Eq, PartialEq)]
pub enum ScanErrorKind {
    UnexpectedChar(char),
    UnexpectedEndOfInput,
    UnknownKeyword(String),
    UnknownMetaCommand(String),
}

impl std::fmt::Display for ScanErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ScanErrorKind::UnexpectedChar(c) => write!(f, "unexpected char: {c:?}"),
            ScanErrorKind::UnexpectedEndOfInput => write!(f, "unexpected end of input"),
            ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"),
            ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"),
        }
    }
}

#[derive(Debug, Eq, PartialEq)]
pub struct ScanError {
    pub location: Location,
    pub kind: ScanErrorKind,
}

impl std::fmt::Display for ScanError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let kind = &self.kind;
        write!(f, "{kind}")
    }
}

impl Tokenizer {
    fn new(input: String, file: String) -> Self {
        Self {
            input,
            file,
            tokens: Vec::new(),
            offset: 0,
        }
    }

    fn current_location(&self, length: usize) -> Location {
        Location::new(self.file.clone(), self.offset, length)
    }

    // `offset` counts characters while `len()` counts bytes, so this test is
    // only exact for ASCII input; for multi-byte input, `scan_token` returning
    // `Ok(None)` ends the scan instead.
    fn is_at_end(&self) -> bool {
        self.offset >= self.input.len()
    }

    fn peek(&self) -> Option<char> {
        self.input.chars().nth(self.offset)
    }

    fn advance(&mut self) -> Option<char> {
        let c = self.input.chars().nth(self.offset);
        self.offset += 1;
        c
    }

    fn recognize_keyword(word: &str) -> Option<TokenData> {
        match word.to_lowercase().as_str() {
            "insert" => Some(TokenData::Insert),
            "select" => Some(TokenData::Select),
            _ => None,
        }
    }

    fn recognize_metacommand(word: &str) -> Option<TokenData> {
        match word.to_lowercase().as_str() {
            ".exit" => Some(TokenData::MetaCommand(MetaCommand::Exit)),
            _ => None,
        }
    }

    fn scan_meta_command(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        let mut length = 0;
        // Consume the leading '.' that got us here.
        if let Some(c) = self.advance() {
            word.push(c);
            length += 1;
        }
        while let Some(c) = self.peek() {
            if c.is_alphabetic() || c == '_' {
                word.push(c);
                self.advance();
            } else {
                break;
            }
            length += 1;
        }
        if let Some(meta) = Self::recognize_metacommand(&word) {
            Ok(Token {
                location: Location::new(self.file.clone(), start_offset, length),
                data: meta,
                lexeme: word,
            })
        } else {
            Err(ScanError {
                location: Location::new(self.file.clone(), start_offset, length),
                kind: ScanErrorKind::UnknownMetaCommand(word),
            })
        }
    }

    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        let mut length = 0;
        if let Some(c) = self.advance() {
            word.push(c);
            length += 1;
        }
        while let Some(c) = self.peek() {
            if Self::ident_or_keyword_inner(c) {
                word.push(c);
                self.advance();
            } else {
                break;
            }
            length += 1;
        }
        if let Some(keyword) = Self::recognize_keyword(&word) {
            Ok(Token {
                location: Location::new(self.file.clone(), start_offset, length),
                data: keyword,
                lexeme: word,
            })
        } else {
            Err(ScanError {
                location: Location::new(self.file.clone(), start_offset, length),
                kind: ScanErrorKind::UnknownKeyword(word),
            })
        }
    }

    fn ident_or_keyword_start(c: char) -> bool {
        c.is_alphabetic() || c == '_'
    }

    fn ident_or_keyword_inner(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }

    fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
        loop {
            if let Some(c) = self.peek() {
                if Self::ident_or_keyword_start(c) {
                    return self.scan_identifier_or_keyword().map(Some);
                } else if c == '.' {
                    return self.scan_meta_command().map(Some);
                } else if c.is_whitespace() {
                    self.advance();
                } else {
                    let result = Err(ScanError {
                        location: self.current_location(1),
                        kind: ScanErrorKind::UnexpectedChar(c),
                    });
                    self.advance();
                    return result;
                }
            } else {
                return Ok(None);
            }
        }
    }

    fn finalize(&mut self) {
        self.tokens.push(Token {
            location: self.current_location(0),
            data: TokenData::EndOfFile,
            lexeme: String::new(),
        });
    }
}
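/// Scans `input` into a sequence of tokens, attributing every location to
/// `file`. On success the stream always ends with a [`TokenData::EndOfFile`]
/// token; on failure, all scan errors encountered are returned rather than
/// only the first.
///
/// ```
/// // A minimal sketch, assuming the crate is published as `osdb` to match
/// // the doc-test on `Location::new` above.
/// use osdb::tokens::{tokenize, TokenData};
/// let tokens = tokenize("select".to_string(), "".to_string()).unwrap();
/// assert_eq!(tokens[0].data, TokenData::Select);
/// assert_eq!(tokens[1].data, TokenData::EndOfFile);
/// ```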
pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
    let mut tokenizer = Tokenizer::new(input, file);
    let mut errors = Vec::new();
    while !tokenizer.is_at_end() {
        let token = tokenizer.scan_token();
        match token {
            Ok(Some(token)) => tokenizer.tokens.push(token),
            Ok(None) => break,
            Err(err) => errors.push(err),
        }
    }
    tokenizer.finalize();
    if errors.is_empty() {
        Ok(tokenizer.tokens)
    } else {
        Err(errors)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use insta::assert_debug_snapshot;

    #[test]
    fn test_tokenize_meta_command() {
        assert_debug_snapshot!(tokenize(".exit".to_string(), "".to_string()));
    }

    #[test]
    fn test_tokenize_unknown_meta_command() {
        assert_debug_snapshot!(tokenize(".halp".to_string(), "".to_string()));
    }

    #[test]
    fn test_tokenizer() {
        let mut scanresult =
            tokenize("INSERT Select".to_string(), "src/statement.sql".to_string()).unwrap();
        scanresult.reverse();
        assert_eq!(
            scanresult.pop(),
            Some(Token {
                location: Location::new(String::from("src/statement.sql"), 0, 6),
                data: TokenData::Insert,
                lexeme: String::from("INSERT"),
            })
        );
        assert_eq!(
            scanresult.pop(),
            Some(Token {
                location: Location::new(String::from("src/statement.sql"), 7, 6),
                data: TokenData::Select,
                lexeme: String::from("Select"),
            })
        );
        assert_eq!(
            scanresult.pop(),
            Some(Token {
                location: Location::new(String::from("src/statement.sql"), 13, 0),
                data: TokenData::EndOfFile,
                lexeme: String::from(""),
            })
        );
        assert_eq!(scanresult.pop(), None);
        assert!(scanresult.is_empty());
    }

    #[test]
    fn test_tokenizer_errors() {
        let scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string())
            .err()
            .unwrap();
        assert_debug_snapshot!(scanerrors);
    }
}
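
// A hedged extra example, not part of the original test suite: the
// `From<&Location>` impl above converts a token's location into a
// `Range<usize>`, which can slice the offending lexeme back out of the
// source text. The module name `range_examples` and the test are introduced
// here purely for illustration; the coordinates match the `Select` token in
// `test_tokenizer` above.
#[cfg(test)]
mod range_examples {
    use super::*;

    #[test]
    fn location_slices_input() {
        let input = "INSERT Select";
        let location = Location::new(String::from("src/statement.sql"), 7, 6);
        // `Into` comes for free from the `From<&Location>` impl.
        let range: std::ops::Range<usize> = (&location).into();
        assert_eq!(&input[range], "Select");
    }
}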