feat(parser): add tokenizer (not actually called)

parent 5b6d878208
commit e841a1779e

2 changed files with 259 additions and 0 deletions
@@ -3,3 +3,4 @@ pub mod cli;
 pub mod command;
 pub mod meta_commands;
 pub mod statements;
+pub mod tokens;

src/tokens.rs (new file, 258 lines)
@@ -0,0 +1,258 @@
#[derive(Debug, Eq, PartialEq)]
pub enum TokenData {
    /// INSERT
    Insert,
    /// SELECT
    Select,
    /// 0, 1, -21635, 867463 (not yet produced by the scanner)
    Integer(i64),
    /// Hello World! (not yet produced by the scanner)
    String(String),
    /// End of input
    EndOfFile,
}

#[derive(Debug, Eq, PartialEq)]
pub struct Location {
    /// File name
    pub file: String,
    /// Offset in characters since the start of the file
    pub offset: usize,
    /// Length of the literal
    pub length: usize,
}

impl Location {
    /// ```
    /// use osdb::tokens::Location;
    /// let location = Location::new(String::from("src/statement.sql"), 0, 10);
    /// assert_eq!(location.file, "src/statement.sql");
    /// assert_eq!(location.offset, 0);
    /// assert_eq!(location.length, 10);
    /// ```
    pub fn new(file: String, offset: usize, length: usize) -> Self {
        Self {
            file,
            offset,
            length,
        }
    }
}
#[derive(Debug, Eq, PartialEq)]
pub struct Token {
    /// Where in the input was this token found?
    pub location: Location,
    /// What is in it?
    pub data: TokenData,
    /// What did it look like while being parsed?
    pub lexeme: String,
}

/// Internal scanner state; only the free function `tokenize` below is public.
struct Tokenizer {
    input: String,
    file: String,
    tokens: Vec<Token>,
    /// Current position in `input`, counted in characters.
    offset: usize,
}
#[derive(Debug, Eq, PartialEq)]
pub enum ScanErrorKind {
    UnexpectedChar(char),
    UnexpectedEndOfInput,
    UnknownKeyword(String),
}

#[derive(Debug, Eq, PartialEq)]
pub struct ScanError {
    pub location: Location,
    pub kind: ScanErrorKind,
}
impl Tokenizer {
    fn new(input: String, file: String) -> Self {
        Self {
            input,
            file,
            tokens: Vec::new(),
            offset: 0,
        }
    }

    /// A `Location` of the given length starting at the current offset.
    fn current_location(&self, length: usize) -> Location {
        Location::new(self.file.clone(), self.offset, length)
    }

    fn is_at_end(&self) -> bool {
        // `offset` counts characters, so compare against the character count;
        // comparing against `self.input.len()` (a byte count) would disagree
        // on multi-byte input and make the `tokenize` loop spin forever.
        self.offset >= self.input.chars().count()
    }
    /// Character at the current offset, without consuming it.
    /// Note: `chars().nth` rescans from the start, so each call is O(n).
    fn peek(&self) -> Option<char> {
        self.input.chars().nth(self.offset)
    }

    /// Consume and return the character at the current offset.
    fn advance(&mut self) -> Option<char> {
        let c = self.input.chars().nth(self.offset);
        self.offset += 1;
        c
    }
    /// Map a scanned word to a keyword token, case-insensitively.
    fn recognize_keyword(word: &str) -> Option<TokenData> {
        match word.to_lowercase().as_str() {
            "insert" => Some(TokenData::Insert),
            "select" => Some(TokenData::Select),
            _ => None,
        }
    }
    fn scan_identifier_or_keyword(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        let mut length = 0;
        // The caller has already checked that the first character is a valid start.
        if let Some(c) = self.advance() {
            word.push(c);
            length += 1;
        }
        while let Some(c) = self.peek() {
            if Self::ident_or_keyword_inner(c) {
                word.push(c);
                self.advance();
                length += 1;
            } else {
                break;
            }
        }
        if let Some(keyword) = Self::recognize_keyword(&word) {
            Ok(Token {
                location: Location::new(self.file.clone(), start_offset, length),
                data: keyword,
                lexeme: word,
            })
        } else {
            // Identifiers are not supported yet, so any word that is not a
            // known keyword is reported as an error.
            Err(ScanError {
                location: Location::new(self.file.clone(), start_offset, length),
                kind: ScanErrorKind::UnknownKeyword(word),
            })
        }
    }

    fn ident_or_keyword_start(c: char) -> bool {
        c.is_alphabetic() || c == '_'
    }

    fn ident_or_keyword_inner(c: char) -> bool {
        c.is_alphanumeric() || c == '_'
    }
    /// Skip whitespace, then scan a single token.
    fn scan_token(&mut self) -> Result<Token, ScanError> {
        loop {
            if let Some(c) = self.peek() {
                if Self::ident_or_keyword_start(c) {
                    return self.scan_identifier_or_keyword();
                } else if c.is_whitespace() {
                    self.advance();
                } else {
                    // Record the location before consuming the character, so
                    // the error points at the offending character itself.
                    let location = self.current_location(1);
                    self.advance();
                    return Err(ScanError {
                        location,
                        kind: ScanErrorKind::UnexpectedChar(c),
                    });
                }
            } else {
                return Err(ScanError {
                    location: self.current_location(0),
                    kind: ScanErrorKind::UnexpectedEndOfInput,
                });
            }
        }
    }
    /// Append the end-of-file marker token.
    fn finalize(&mut self) {
        self.tokens.push(Token {
            location: self.current_location(0),
            data: TokenData::EndOfFile,
            lexeme: String::new(),
        });
    }
}
/// Tokenize `input`, collecting every scan error instead of stopping at the first.
pub fn tokenize(input: String, file: String) -> Result<Vec<Token>, Vec<ScanError>> {
    let mut tokenizer = Tokenizer::new(input, file);
    let mut errors = Vec::new();
    while !tokenizer.is_at_end() {
        match tokenizer.scan_token() {
            Ok(token) => tokenizer.tokens.push(token),
            Err(err) => errors.push(err),
        }
    }
    tokenizer.finalize();
    if errors.is_empty() {
        Ok(tokenizer.tokens)
    } else {
        Err(errors)
    }
}
#[test]
fn test_tokenizer() {
    let mut scanresult =
        tokenize("INSERT Select".to_string(), "src/statement.sql".to_string()).unwrap();
    scanresult.reverse();
    assert_eq!(
        scanresult.pop(),
        Some(Token {
            location: Location::new(String::from("src/statement.sql"), 0, 6),
            data: TokenData::Insert,
            lexeme: String::from("INSERT"),
        })
    );
    assert_eq!(
        scanresult.pop(),
        Some(Token {
            location: Location::new(String::from("src/statement.sql"), 7, 6),
            data: TokenData::Select,
            lexeme: String::from("Select"),
        })
    );
    assert_eq!(
        scanresult.pop(),
        Some(Token {
            location: Location::new(String::from("src/statement.sql"), 13, 0),
            data: TokenData::EndOfFile,
            lexeme: String::from(""),
        })
    );
    assert_eq!(scanresult.pop(), None);
    assert!(scanresult.is_empty());
}
#[test]
fn test_tokenizer_errors() {
    let mut scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string())
        .err()
        .unwrap();
    scanerrors.reverse();
    assert_eq!(
        scanerrors.pop(),
        Some(ScanError {
            location: Location {
                file: "src/statement.sql".to_string(),
                offset: 0,
                length: 6,
            },
            kind: ScanErrorKind::UnknownKeyword("salact".to_string()),
        })
    );
    assert_eq!(
        scanerrors.pop(),
        Some(ScanError {
            location: Location {
                file: "src/statement.sql".to_string(),
                // The '+' sits at character offset 7; the error location now
                // points at the character itself rather than one past it.
                offset: 7,
                length: 1,
            },
            kind: ScanErrorKind::UnexpectedChar('+'),
        })
    );
    assert!(scanerrors.is_empty());
}
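
Since the commit message notes the tokenizer is not actually called yet, here is a minimal sketch of how a caller might wire `tokenize` up. The `osdb` crate path comes from the doctest above; the binary itself, the "<repl>" file label, and the error formatting are assumptions, not part of this commit.

// Hypothetical caller; only `osdb::tokens::tokenize` exists in this commit.
use osdb::tokens::tokenize;

fn main() {
    let source = String::from("INSERT Select");
    match tokenize(source, String::from("<repl>")) {
        // On success, every scanned token is returned, terminated by EndOfFile.
        Ok(tokens) => {
            for token in tokens {
                println!("{:?} at offset {}", token.data, token.location.offset);
            }
        }
        // On failure, all scan errors are available, not just the first.
        Err(errors) => {
            for error in errors {
                eprintln!("{}:{}: {:?}", error.location.file, error.location.offset, error.kind);
            }
        }
    }
}

Returning all scan errors at once, rather than failing on the first, lets such a caller report every problem in a statement in a single pass.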