diff --git a/src/lib.rs b/src/lib.rs
index ae59f40..63ea784 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,3 +3,4 @@ pub mod cli;
 pub mod command;
 pub mod meta_commands;
 pub mod statements;
+pub mod tokens;
diff --git a/src/tokens.rs b/src/tokens.rs
new file mode 100644
index 0000000..5ee06a4
--- /dev/null
+++ b/src/tokens.rs
@@ -0,0 +1,258 @@
+#[derive(Debug, Eq, PartialEq)]
+pub enum TokenData {
+    /// INSERT
+    Insert,
+    /// SELECT
+    Select,
+    /// 0, 1, -21635, 867463
+    Integer(i64),
+    /// Hello World!
+    String(String),
+    /// No file O.O?
+    EndOfFile,
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub struct Location {
+    /// File name
+    pub file: String,
+    /// Offset since the start of the file, counted in characters
+    pub offset: usize,
+    /// Length of the literal
+    pub length: usize,
+}
+
+impl Location {
+    /// ```
+    /// use osdb::tokens::Location;
+    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
+    /// assert_eq!(location.file, "src/statement.sql");
+    /// assert_eq!(location.offset, 0);
+    /// assert_eq!(location.length, 10);
+    /// ```
+    pub fn new(file: String, offset: usize, length: usize) -> Self {
+        Self {
+            file,
+            offset,
+            length,
+        }
+    }
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub struct Token {
+    /// Where in the input was this token found?
+    pub location: Location,
+    /// What is in it?
+    pub data: TokenData,
+    /// What did it look like while being parsed?
+    pub lexeme: String,
+}
+
+/// Internal scanner state; use [`tokenize`] as the public entry point.
+struct Tokenizer {
+    input: String,
+    file: String,
+    tokens: Vec<Token>,
+    /// Current position in `input`, counted in characters.
+    offset: usize,
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum ScanErrorKind {
+    UnexpectedChar(char),
+    UnexpectedEndOfInput,
+    UnknownKeyword(String),
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub struct ScanError {
+    pub location: Location,
+    pub kind: ScanErrorKind,
+}
+
+impl Tokenizer {
+    fn new(input: String, file: String) -> Self {
+        Self {
+            input,
+            file,
+            tokens: Vec::new(),
+            offset: 0,
+        }
+    }
+
+    fn current_location(&self, length: usize) -> Location {
+        Location::new(self.file.clone(), self.offset, length)
+    }
+
+    fn is_at_end(&self) -> bool {
+        // `offset` counts characters, so compare against the character count,
+        // not the byte length, to stay correct for multi-byte input.
+        self.offset >= self.input.chars().count()
+    }
+
+    fn peek(&self) -> Option<char> {
+        self.input.chars().nth(self.offset)
+    }
+
+    fn advance(&mut self) -> Option<char> {
+        let c = self.input.chars().nth(self.offset);
+        self.offset += 1;
+        c
+    }
+
+    fn recognize_keyword(word: &str) -> Option<TokenData> {
+        match word.to_lowercase().as_str() {
+            "insert" => Some(TokenData::Insert),
+            "select" => Some(TokenData::Select),
+            _ => None,
+        }
+    }
+
+    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
+        let start_offset = self.offset;
+        let mut word = String::new();
+        let mut length = 0;
+        if let Some(c) = self.advance() {
+            word.push(c);
+            length += 1;
+        }
+        while let Some(c) = self.peek() {
+            if !Self::ident_or_keyword_inner(c) {
+                break;
+            }
+            word.push(c);
+            self.advance();
+            length += 1;
+        }
+        if let Some(keyword) = Self::recognize_keyword(&word) {
+            Ok(Token {
+                location: Location::new(self.file.clone(), start_offset, length),
+                data: keyword,
+                lexeme: word,
+            })
+        } else {
+            Err(ScanError {
+                location: Location::new(self.file.clone(), start_offset, length),
+                kind: ScanErrorKind::UnknownKeyword(word),
+            })
+        }
+    }
+
+    fn ident_or_keyword_start(c: char) -> bool {
+        c.is_alphabetic() || c == '_'
+    }
+
+    fn ident_or_keyword_inner(c: char) -> bool {
+        c.is_alphanumeric() || c == '_'
+    }
+
+    fn scan_token(&mut self) -> Result<Token, ScanError> {
+        loop {
+            if let Some(c) = self.peek() {
+                if Self::ident_or_keyword_start(c) {
+                    return self.scan_identifier_or_keyword();
+                } else if c.is_whitespace() {
+                    self.advance();
+                } else {
+                    // Record the location before advancing so the error
+                    // points at the offending character.
+                    let location = self.current_location(1);
+                    self.advance();
+                    return Err(ScanError {
+                        location,
+                        kind: ScanErrorKind::UnexpectedChar(c),
+                    });
+                }
+            } else {
+                return Err(ScanError {
+                    location: self.current_location(0),
+                    kind: ScanErrorKind::UnexpectedEndOfInput,
+                });
+            }
+        }
+    }
+
+    fn finalize(&mut self) {
+        self.tokens.push(Token {
+            location: self.current_location(0),
+            data: TokenData::EndOfFile,
+            lexeme: String::new(),
+        });
+    }
+}
+
+/// Scan `input` into tokens, collecting every scan error instead of stopping
+/// at the first one. On success the token list always ends with
+/// `TokenData::EndOfFile`.
+pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
+    let mut tokenizer = Tokenizer::new(input, file);
+    let mut errors = Vec::new();
+    while !tokenizer.is_at_end() {
+        match tokenizer.scan_token() {
+            Ok(token) => tokenizer.tokens.push(token),
+            // Trailing whitespace is consumed without producing a token;
+            // reaching the end of input there is not an error.
+            Err(err) if matches!(err.kind, ScanErrorKind::UnexpectedEndOfInput) => break,
+            Err(err) => errors.push(err),
+        }
+    }
+    tokenizer.finalize();
+    if errors.is_empty() {
+        Ok(tokenizer.tokens)
+    } else {
+        Err(errors)
+    }
+}
+
+#[test]
+fn test_tokenizer() {
+    let mut scanresult =
+        tokenize("INSERT Select".to_string(), "src/statement.sql".to_string()).unwrap();
+    scanresult.reverse();
+    assert_eq!(
+        scanresult.pop(),
+        Some(Token {
+            location: Location::new(String::from("src/statement.sql"), 0, 6),
+            data: TokenData::Insert,
+            lexeme: String::from("INSERT"),
+        })
+    );
+    assert_eq!(
+        scanresult.pop(),
+        Some(Token {
+            location: Location::new(String::from("src/statement.sql"), 7, 6),
+            data: TokenData::Select,
+            lexeme: String::from("Select"),
+        })
+    );
+    assert_eq!(
+        scanresult.pop(),
+        Some(Token {
+            location: Location::new(String::from("src/statement.sql"), 13, 0),
+            data: TokenData::EndOfFile,
+            lexeme: String::from(""),
+        })
+    );
+    assert_eq!(scanresult.pop(), None);
+    assert!(scanresult.is_empty());
+}
+
+#[test]
+fn test_tokenizer_errors() {
+    let mut scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string())
+        .err()
+        .unwrap();
+    scanerrors.reverse();
+    assert_eq!(
+        scanerrors.pop(),
+        Some(ScanError {
+            location: Location {
+                file: "src/statement.sql".to_string(),
+                offset: 0,
+                length: 6,
+            },
+            kind: ScanErrorKind::UnknownKeyword("salact".to_string()),
+        })
+    );
+    assert_eq!(
+        scanerrors.pop(),
+        Some(ScanError {
+            location: Location {
+                file: "src/statement.sql".to_string(),
+                // The error location now points at the '+' itself (offset 7)
+                // rather than one past it.
+                offset: 7,
+                length: 1,
+            },
+            kind: ScanErrorKind::UnexpectedChar('+'),
+        })
+    );
+    assert!(scanerrors.is_empty());
+}
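
For reviewers, a minimal usage sketch of the new `tokenize` entry point. This driver is hypothetical and not part of the diff; it assumes the crate is named `osdb`, as in the `Location::new` doctest. Note that `TokenData::Integer` and `TokenData::String` are declared but not yet produced by the scanner, so digits and quotes are still reported as `UnexpectedChar`.

// examples/scan.rs (hypothetical, not included in this diff)
use osdb::tokens::tokenize;

fn main() {
    // Keywords are matched case-insensitively via `recognize_keyword`, so
    // "insert" and "SELECT" both produce keyword tokens.
    match tokenize("insert SELECT".to_string(), "repl".to_string()) {
        Ok(tokens) => {
            // Prints Insert at 0, Select at 7, then the EndOfFile marker at 13.
            for token in &tokens {
                println!("{:?} at offset {}", token.data, token.location.offset);
            }
        }
        Err(errors) => {
            // On failure, every scan error is reported, not just the first.
            for err in &errors {
                eprintln!(
                    "scan error in {} at offset {}: {:?}",
                    err.location.file, err.location.offset, err.kind
                );
            }
        }
    }
}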