osdb/src/tokens.rs

#[derive(Debug, Eq, PartialEq)]
pub enum TokenData {
    /// INSERT
    Insert,
    /// SELECT
    Select,
    /// 0, 1, -21635, 867463
    Integer(i64),
    /// Hello World!
    String(String),
    /// End of the input
    EndOfFile,
}
#[derive(Debug, Eq, PartialEq)]
pub struct Location {
    /// File name
    pub file: String,
    /// Offset from the start of the file
    pub offset: usize,
    /// Length of the literal
    pub length: usize,
}
impl Location {
    /// ```
    /// use osdb::tokens::Location;
    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
    /// assert_eq!(location.file, "src/statement.sql");
    /// assert_eq!(location.offset, 0);
    /// assert_eq!(location.length, 10);
    /// ```
    pub fn new(file: String, offset: usize, length: usize) -> Self {
        Self {
            file,
            offset,
            length,
        }
    }
}
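/// A single scanned token: where it was found, what it means, and how it was spelled.
///
/// A small sketch of what comes back from [`tokenize`] (the statement and file
/// name here are illustrative):
/// ```
/// use osdb::tokens::tokenize;
/// let tokens = tokenize("SELECT".to_string(), "query.sql".to_string()).unwrap();
/// assert_eq!(tokens[0].lexeme, "SELECT");
/// assert_eq!(tokens[0].location.offset, 0);
/// assert_eq!(tokens[0].location.length, 6);
/// ```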
#[derive(Debug, Eq, PartialEq)]
pub struct Token {
    /// Where in the input was this token found?
    pub location: Location,
    /// What is in it?
    pub data: TokenData,
    /// What did it look like while being parsed?
    pub lexeme: String,
}
/// Internal scanner state: walks `input` once and accumulates tokens.
struct Tokenizer {
    /// The text being scanned
    input: String,
    /// File name used when building `Location`s
    file: String,
    /// Tokens produced so far
    tokens: Vec<Token>,
    /// Current byte offset into `input`
    offset: usize,
}
#[derive(Debug, Eq, PartialEq)]
pub enum ScanErrorKind {
    /// A character that cannot begin any token
    UnexpectedChar(char),
    /// The input ended while a token was still expected
    UnexpectedEndOfInput,
    /// A word that is not a recognized keyword
    UnknownKeyword(String),
}
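/// A scan failure: which kind of error occurred and where in the input.
///
/// A minimal sketch of inspecting the errors returned by [`tokenize`]
/// (the input and file name here are illustrative):
/// ```
/// use osdb::tokens::{tokenize, ScanErrorKind};
/// let errors = tokenize("+".to_string(), "query.sql".to_string()).unwrap_err();
/// assert_eq!(errors[0].kind, ScanErrorKind::UnexpectedChar('+'));
/// ```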
#[derive(Debug, Eq, PartialEq)]
pub struct ScanError {
    /// Where in the input the error occurred
    pub location: Location,
    /// What went wrong
    pub kind: ScanErrorKind,
}
impl Tokenizer {
    fn new(input: String, file: String) -> Self {
        Self {
            input,
            file,
            tokens: Vec::new(),
            offset: 0,
        }
    }
    fn current_location(&self, length: usize) -> Location {
        Location::new(self.file.clone(), self.offset, length)
    }
    fn is_at_end(&self) -> bool {
        self.offset >= self.input.len()
    }
    /// Look at the next character without consuming it.
    fn peek(&self) -> Option<char> {
        self.input[self.offset..].chars().next()
    }
    /// Consume and return the next character, advancing by its UTF-8 width
    /// so that `offset` stays a valid byte index into `input`.
    fn advance(&mut self) -> Option<char> {
        let c = self.peek();
        if let Some(c) = c {
            self.offset += c.len_utf8();
        }
        c
    }
    /// Keywords are matched case-insensitively.
    fn recognize_keyword(word: &str) -> Option<TokenData> {
        match word.to_lowercase().as_str() {
            "insert" => Some(TokenData::Insert),
            "select" => Some(TokenData::Select),
            _ => None,
        }
    }
    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        if let Some(c) = self.advance() {
            word.push(c);
        }
        while let Some(c) = self.peek() {
            if Self::ident_or_keyword_inner(c) {
                word.push(c);
                self.advance();
            } else {
                break;
            }
        }
        // Length in bytes, consistent with the byte offsets stored in `Location`.
        let length = self.offset - start_offset;
        let location = Location::new(self.file.clone(), start_offset, length);
        if let Some(keyword) = Self::recognize_keyword(&word) {
            Ok(Token {
                location,
                data: keyword,
                lexeme: word,
            })
        } else {
            Err(ScanError {
                location,
                kind: ScanErrorKind::UnknownKeyword(word),
            })
        }
    }
    fn ident_or_keyword_start(c: char) -> bool {
        c.is_alphabetic() || c == '_'
    }
    fn ident_or_keyword_inner(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }
    fn scan_token(&mut self) -> Result<Token, ScanError> {
        loop {
            if let Some(c) = self.peek() {
                if Self::ident_or_keyword_start(c) {
                    return self.scan_identifier_or_keyword();
                } else if c.is_whitespace() {
                    self.advance();
                } else {
                    // Record the location of the offending character before
                    // consuming it, so the error points at it rather than past it.
                    let location = self.current_location(c.len_utf8());
                    self.advance();
                    return Err(ScanError {
                        location,
                        kind: ScanErrorKind::UnexpectedChar(c),
                    });
                }
            } else {
                return Err(ScanError {
                    location: self.current_location(0),
                    kind: ScanErrorKind::UnexpectedEndOfInput,
                });
            }
        }
    }
    fn finalize(&mut self) {
        self.tokens.push(Token {
            location: self.current_location(0),
            data: TokenData::EndOfFile,
            lexeme: String::new(),
        });
    }
}
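/// Scans `input` into a flat list of tokens. On success the list always ends
/// with a [`TokenData::EndOfFile`] token; if anything could not be scanned,
/// every collected [`ScanError`] is returned instead.
///
/// A usage sketch (the statement and file name are illustrative; note that
/// keywords are recognized case-insensitively):
/// ```
/// use osdb::tokens::{tokenize, TokenData};
/// let tokens = tokenize("INSERT select".to_string(), "query.sql".to_string()).unwrap();
/// assert_eq!(tokens[0].data, TokenData::Insert);
/// assert_eq!(tokens[1].data, TokenData::Select);
/// assert_eq!(tokens[2].data, TokenData::EndOfFile);
/// ```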
pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
    let mut tokenizer = Tokenizer::new(input, file);
    let mut errors = Vec::new();
    while !tokenizer.is_at_end() {
        match tokenizer.scan_token() {
            Ok(token) => tokenizer.tokens.push(token),
            Err(err) => errors.push(err),
        }
    }
    tokenizer.finalize();
    if errors.is_empty() {
        Ok(tokenizer.tokens)
    } else {
        Err(errors)
    }
}
#[test]
fn test_tokenizer() {
    let mut scanresult =
        tokenize("INSERT Select".to_string(), "src/statement.sql".to_string()).unwrap();
    scanresult.reverse();
    assert_eq!(
        scanresult.pop(),
        Some(Token {
            location: Location::new(String::from("src/statement.sql"), 0, 6),
            data: TokenData::Insert,
            lexeme: String::from("INSERT"),
        })
    );
    assert_eq!(
        scanresult.pop(),
        Some(Token {
            location: Location::new(String::from("src/statement.sql"), 7, 6),
            data: TokenData::Select,
            lexeme: String::from("Select"),
        })
    );
    assert_eq!(
        scanresult.pop(),
        Some(Token {
            location: Location::new(String::from("src/statement.sql"), 13, 0),
            data: TokenData::EndOfFile,
            lexeme: String::from(""),
        })
    );
    assert_eq!(scanresult.pop(), None);
    assert!(scanresult.is_empty());
}
#[test]
fn test_tokenizer_errors() {
    let mut scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string())
        .err()
        .unwrap();
    scanerrors.reverse();
    assert_eq!(
        scanerrors.pop(),
        Some(ScanError {
            location: Location {
                file: "src/statement.sql".to_string(),
                offset: 0,
                length: 6,
            },
            kind: ScanErrorKind::UnknownKeyword("salact".to_string()),
        })
    );
    assert_eq!(
        scanerrors.pop(),
        Some(ScanError {
            location: Location {
                file: "src/statement.sql".to_string(),
                // The '+' sits at byte offset 7 in "salact +".
                offset: 7,
                length: 1,
            },
            kind: ScanErrorKind::UnexpectedChar('+'),
        })
    );
    assert!(scanerrors.is_empty());
}