use crate::meta_commands::MetaCommand;
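
/// Everything a token can carry: SQL keywords, meta-commands, literals,
/// and the end-of-input marker.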
#[derive(Debug, Eq, PartialEq)]
pub enum TokenData {
    Insert,
    Select,
    MetaCommand(MetaCommand),
    EndOfFile,
    Int(i64),
}

#[derive(Debug, Eq, PartialEq)]
pub struct Location {
    /// Name of the file the input came from.
    pub file: String,
    /// Offset since the start of the file.
    pub offset: usize,
    /// Length of the literal.
    pub length: usize,
}
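
/// Converts a `Location` into the half-open range `offset..offset + length`,
/// e.g. for slicing the matching lexeme back out of the input.
///
/// ```
/// use osdb::tokens::Location;
/// let location = Location::new(String::from("src/statement.sql"), 7, 6);
/// let range: std::ops::Range<usize> = (&location).into();
/// assert_eq!(range, 7..13);
/// ```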
impl From<&Location> for std::ops::Range<usize> {
    fn from(val: &Location) -> Self {
        std::ops::Range {
            start: val.offset,
            end: val.offset + val.length,
        }
    }
}

impl Location {
    /// ```
    /// use osdb::tokens::Location;
    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
    /// assert_eq!(location.file, "src/statement.sql");
    /// assert_eq!(location.offset, 0);
    /// assert_eq!(location.length, 10);
    /// ```
    pub fn new(file: String, offset: usize, length: usize) -> Self {
        Self {
            file,
            offset,
            length,
        }
    }
}

#[derive(Debug, Eq, PartialEq)]
pub struct Token {
    /// Where in the input was this token found?
    pub location: Location,
    /// What is in it?
    pub data: TokenData,
    /// What did it look like while being parsed?
    pub lexeme: String,
}
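
/// Scanning state: the input, the file it came from (for locations),
/// the tokens produced so far, and the current char offset.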
struct Tokenizer {
    input: String,
    file: String,
    tokens: Vec<Token>,
    offset: usize,
}
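
/// Everything that can go wrong while scanning.
///
/// Each kind formats to a human-readable message:
/// ```
/// use osdb::tokens::ScanErrorKind;
/// let kind = ScanErrorKind::UnknownKeyword(String::from("salact"));
/// assert_eq!(kind.to_string(), "unknown keyword: \"salact\"");
/// ```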
#[derive(Debug, Eq, PartialEq)]
pub enum ScanErrorKind {
    UnexpectedChar(char),
    UnexpectedEndOfInput,
    UnknownKeyword(String),
    UnknownMetaCommand(String),
}

impl std::fmt::Display for ScanErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ScanErrorKind::UnexpectedChar(c) => write!(f, "unexpected char: {c:?}"),
            ScanErrorKind::UnexpectedEndOfInput => write!(f, "unexpected end of input"),
            ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"),
            ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"),
        }
    }
}
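
/// A scan failure together with where in the input it happened.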
#[derive(Debug, Eq, PartialEq)]
pub struct ScanError {
    pub location: Location,
    pub kind: ScanErrorKind,
}

impl std::fmt::Display for ScanError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let kind = &self.kind;
        write!(f, "{kind}")
    }
}

impl Tokenizer {
    fn new(input: String, file: String) -> Self {
        Self {
            input,
            file,
            tokens: Vec::new(),
            offset: 0,
        }
    }

    fn current_location(&self, length: usize) -> Location {
        Location::new(self.file.clone(), self.offset, length)
    }

    // NOTE: `offset` counts chars while `len()` counts bytes, so on
    // multi-byte input this can report "not at end" after the last char;
    // `scan_token` returning `Ok(None)` still terminates the scan.
    fn is_at_end(&self) -> bool {
        self.offset >= self.input.len()
    }
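
    // NOTE: `chars().nth(offset)` walks the string from the beginning on
    // every call, making scanning quadratic in the input length. That is
    // fine for short REPL lines; a stored `Peekable<Chars>` would make it
    // linear, at the cost of keeping an iterator alongside `offset`.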
    fn peek(&self) -> Option<char> {
        self.input.chars().nth(self.offset)
    }

    fn advance(&mut self) -> Option<char> {
        let c = self.input.chars().nth(self.offset);
        self.offset += 1;
        c
    }

    fn recognize_keyword(word: &str) -> Option<TokenData> {
        match word.to_lowercase().as_str() {
            "insert" => Some(TokenData::Insert),
            "select" => Some(TokenData::Select),
            _ => None,
        }
    }

    fn recognize_metacommand(word: &str) -> Option<TokenData> {
        match word.to_lowercase().as_str() {
            ".exit" => Some(TokenData::MetaCommand(MetaCommand::Exit)),
            _ => None,
        }
    }
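
    /// Scans a meta-command: the leading `.` plus the word that follows.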
    fn scan_meta_command(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        let mut length = 0;
        // Consume the leading `.` (already known to be present from `scan_token`).
        if let Some(c) = self.advance() {
            word.push(c);
            length += 1;
        }
        while let Some(c) = self.peek() {
            if c.is_alphabetic() || c == '_' {
                word.push(c);
                self.advance();
            } else {
                break;
            }
            length += 1;
        }
        if let Some(meta) = Self::recognize_metacommand(&word) {
            Ok(Token {
                location: Location::new(self.file.clone(), start_offset, length),
                data: meta,
                lexeme: word,
            })
        } else {
            Err(ScanError {
                location: Location::new(self.file.clone(), start_offset, length),
                kind: ScanErrorKind::UnknownMetaCommand(word),
            })
        }
    }
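
    /// Scans a word and resolves it against the known keywords; anything
    /// else is reported as `UnknownKeyword`, since plain identifiers have
    /// no `TokenData` variant yet.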
    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        let mut length = 0;
        if let Some(c) = self.advance() {
            word.push(c);
            length += 1;
        }
        while let Some(c) = self.peek() {
            if Self::ident_or_keyword_inner(c) {
                word.push(c);
                self.advance();
            } else {
                break;
            }
            length += 1;
        }
        if let Some(keyword) = Self::recognize_keyword(&word) {
            Ok(Token {
                location: Location::new(self.file.clone(), start_offset, length),
                data: keyword,
                lexeme: word,
            })
        } else {
            Err(ScanError {
                location: Location::new(self.file.clone(), start_offset, length),
                kind: ScanErrorKind::UnknownKeyword(word),
            })
        }
    }

    fn ident_or_keyword_start(c: char) -> bool {
        c.is_alphabetic() || c == '_'
    }

    fn ident_or_keyword_inner(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }
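
    /// Skips whitespace, then dispatches on the first significant character.
    /// `Ok(None)` signals that the input is exhausted.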
    fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
        loop {
            if let Some(c) = self.peek() {
                if Self::ident_or_keyword_start(c) {
                    return self.scan_identifier_or_keyword().map(Some);
                } else if c == '.' {
                    return self.scan_meta_command().map(Some);
                } else if c.is_whitespace() {
                    self.advance();
                } else {
                    let result = Err(ScanError {
                        location: self.current_location(1),
                        kind: ScanErrorKind::UnexpectedChar(c),
                    });
                    // Skip past the offending char so scanning can resume.
                    self.advance();
                    return result;
                }
            } else {
                return Ok(None);
            }
        }
    }
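
    /// Appends the zero-length `EndOfFile` token that terminates every scan.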
    fn finalize(&mut self) {
        self.tokens.push(Token {
            location: self.current_location(0),
            data: TokenData::EndOfFile,
            lexeme: String::new(),
        });
    }
}
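
/// Scans `input` into a token stream terminated by `EndOfFile`, collecting
/// every scan error instead of stopping at the first one.
///
/// A minimal usage sketch (assuming this module is exposed as `osdb::tokens`,
/// as in the doc-test on [`Location::new`]):
/// ```
/// use osdb::tokens::{tokenize, TokenData};
/// let tokens = tokenize("select".to_string(), "<stdin>".to_string()).unwrap();
/// assert_eq!(tokens[0].data, TokenData::Select);
/// assert_eq!(tokens[1].data, TokenData::EndOfFile);
/// ```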
pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
    let mut tokenizer = Tokenizer::new(input, file);
    let mut errors = Vec::new();
    while !tokenizer.is_at_end() {
        let token = tokenizer.scan_token();
        match token {
            Ok(Some(token)) => tokenizer.tokens.push(token),
            Ok(None) => break,
            Err(err) => errors.push(err),
        }
    }
    tokenizer.finalize();
    if errors.is_empty() {
        Ok(tokenizer.tokens)
    } else {
        Err(errors)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use insta::assert_debug_snapshot;

    #[test]
    fn test_tokenize_meta_command() {
        assert_debug_snapshot!(tokenize(".exit".to_string(), "<stdin>".to_string()));
    }

    #[test]
    fn test_tokenize_unknown_meta_command() {
        assert_debug_snapshot!(tokenize(".halp".to_string(), "<stdin>".to_string()));
    }

    #[test]
    fn test_tokenizer() {
        let mut scanresult =
            tokenize("INSERT Select".to_string(), "src/statement.sql".to_string()).unwrap();
        scanresult.reverse();
        assert_eq!(
            scanresult.pop(),
            Some(Token {
                location: Location::new(String::from("src/statement.sql"), 0, 6),
                data: TokenData::Insert,
                lexeme: String::from("INSERT"),
            })
        );
        assert_eq!(
            scanresult.pop(),
            Some(Token {
                location: Location::new(String::from("src/statement.sql"), 7, 6),
                data: TokenData::Select,
                lexeme: String::from("Select"),
            })
        );
        assert_eq!(
            scanresult.pop(),
            Some(Token {
                location: Location::new(String::from("src/statement.sql"), 13, 0),
                data: TokenData::EndOfFile,
                lexeme: String::from(""),
            })
        );
        assert_eq!(scanresult.pop(), None);
        assert!(scanresult.is_empty());
    }

    #[test]
    fn test_tokenizer_errors() {
        let scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string())
            .err()
            .unwrap();
        assert_debug_snapshot!(scanerrors);
    }
}