feat(tokenizer): parse integers
parent 71a9d82d96
commit f259b079b7
13 changed files with 311 additions and 27 deletions
@@ -214,6 +214,10 @@ i will use rustyline, since it seems like the most feature-complete

* STRT parse integers
** TODO Function to get a token until condition is false
** TODO Parse the integer
* TODO parse strings
* WAIT cli tests using insta-cmd
@@ -11,7 +11,20 @@ impl OSDBError for CommandParseError {
            CommandParseError::Scan(x) => {
                x.display(file, input);
            }
            _ => todo!(),
            CommandParseError::UnexpectedToken(token, items) => {
                let location = (file, Into::<std::ops::Range<usize>>::into(&token.location));
                Report::build(ReportKind::Error, location.clone())
                    .with_message("unexpected token")
                    .with_label(
                        Label::new(location.clone())
                            .with_color(Color::Red)
                            .with_message(format!("found {token}")),
                    )
                    .with_note(format!("expected token type to be one of {items:?}"))
                    .finish()
                    .print((file, Source::from(input)))
                    .unwrap()
            }
        }
    }
}
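The new `UnexpectedToken` arm replaces the `_ => todo!()` fallback with a rendered diagnostic. The `Report`/`Label`/`ReportKind`/`Source` builder chain looks like the ariadne crate's API, where a span is an `(id, Range<usize>)` pair — hence the conversion of the token's `Location` into a `std::ops::Range<usize>` before building the report.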
@@ -26,7 +39,6 @@ impl OSDBError for ScanError {
                        .with_color(Color::Red)
                        .with_message(format!("{self}")),
                )
                .with_help("Make sure you don't have any typos or unexpected characters.")
                .finish()
                .print((file, Source::from(input)))
                .unwrap();
@@ -3,6 +3,14 @@ pub enum MetaCommand {
    Exit,
}

impl std::fmt::Display for MetaCommand {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            MetaCommand::Exit => write!(f, "exit"),
        }
    }
}

pub struct MetaCommandExecuteResult {
    pub should_exit: bool,
}
@@ -1,8 +1,9 @@
---
source: src/tokens.rs
expression: scanerrors
expression: "tokenize(\"salact +\".to_string(), \"src/statement.sql\".to_string())"
---
[
Err(
    [
        ScanError {
            location: Location {
                file: "src/statement.sql",
@@ -19,8 +20,11 @@ expression: scanerrors
                offset: 7,
                length: 1,
            },
            kind: UnexpectedChar(
                '+',
            kind: ParseIntError(
                ParseIntError {
                    kind: InvalidDigit,
                },
            ),
        },
    ]
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers-2.snap (new file, 28 lines)
@@ -0,0 +1,28 @@
---
source: src/tokens.rs
expression: "tokenize(\"-10\".to_string(), \"src/ints.sql\".to_string(),)"
---
Ok(
    [
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 3,
            },
            data: Int(
                -10,
            ),
            lexeme: "-10",
        },
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 3,
                length: 0,
            },
            data: EndOfFile,
            lexeme: "",
        },
    ],
)
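The span bookkeeping is visible in the snapshot: for the three-byte input `-10`, the `Int` token records `offset: 0, length: 3`, and the tokenizer appends a zero-length `EndOfFile` token at `offset: 3`. The remaining snapshots follow the same pattern.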
src/snapshots/osdb__tokens__tests__tokenizer_integers-3.snap (new file, 28 lines)
@@ -0,0 +1,28 @@
---
source: src/tokens.rs
expression: "tokenize(\"0\".to_string(), \"src/ints.sql\".to_string(),)"
---
Ok(
    [
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 1,
            },
            data: Int(
                0,
            ),
            lexeme: "0",
        },
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 1,
                length: 0,
            },
            data: EndOfFile,
            lexeme: "",
        },
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers-4.snap (new file, 28 lines)
@@ -0,0 +1,28 @@
---
source: src/tokens.rs
expression: "tokenize(\"-0\".to_string(), \"src/ints.sql\".to_string(),)"
---
Ok(
    [
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 2,
            },
            data: Int(
                0,
            ),
            lexeme: "-0",
        },
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 2,
                length: 0,
            },
            data: EndOfFile,
            lexeme: "",
        },
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers-5.snap (new file, 20 lines)
@@ -0,0 +1,20 @@
---
source: src/tokens.rs
expression: "tokenize(\"--0\".to_string(), \"src/ints.sql\".to_string(),)"
---
Err(
    [
        ScanError {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 3,
            },
            kind: ParseIntError(
                ParseIntError {
                    kind: InvalidDigit,
                },
            ),
        },
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers-6.snap (new file, 20 lines)
@@ -0,0 +1,20 @@
---
source: src/tokens.rs
expression: "tokenize(\"++0\".to_string(), \"src/ints.sql\".to_string(),)"
---
Err(
    [
        ScanError {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 3,
            },
            kind: ParseIntError(
                ParseIntError {
                    kind: InvalidDigit,
                },
            ),
        },
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers-7.snap (new file, 20 lines)
@@ -0,0 +1,20 @@
---
source: src/tokens.rs
expression: "tokenize(\"-\".to_string(), \"src/ints.sql\".to_string(),)"
---
Err(
    [
        ScanError {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 1,
            },
            kind: ParseIntError(
                ParseIntError {
                    kind: InvalidDigit,
                },
            ),
        },
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers-8.snap (new file, 20 lines)
@@ -0,0 +1,20 @@
---
source: src/tokens.rs
expression: "tokenize(\"+\".to_string(), \"src/ints.sql\".to_string(),)"
---
Err(
    [
        ScanError {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 1,
            },
            kind: ParseIntError(
                ParseIntError {
                    kind: InvalidDigit,
                },
            ),
        },
    ],
)
src/snapshots/osdb__tokens__tests__tokenizer_integers.snap (new file, 28 lines)
@@ -0,0 +1,28 @@
---
source: src/tokens.rs
expression: "tokenize(\"10\".to_string(), \"src/ints.sql\".to_string(),)"
---
Ok(
    [
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 0,
                length: 2,
            },
            data: Int(
                10,
            ),
            lexeme: "10",
        },
        Token {
            location: Location {
                file: "src/ints.sql",
                offset: 2,
                length: 0,
            },
            data: EndOfFile,
            lexeme: "",
        },
    ],
)
@@ -55,6 +55,20 @@ pub struct Token {
    pub lexeme: String,
}

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match &self.data {
            TokenData::Insert => write!(f, "insert statement"),
            TokenData::Select => write!(f, "select statement"),
            TokenData::MetaCommand(x) => write!(f, "meta-command {x}"),
            TokenData::EndOfFile => write!(f, "end of file"),
            TokenData::Int(x) => write!(f, "integer {x}"),
        }?;
        let lexeme = &self.lexeme;
        write!(f, " {lexeme:?}")
    }
}

struct Tokenizer {
    input: String,
    file: String,
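With this `Display` impl, the unexpected-token report above can show a readable description followed by the quoted lexeme — a `TokenData::Int(-10)` token with lexeme `-10`, for example, renders as `integer -10 "-10"`.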
@@ -68,6 +82,7 @@ pub enum ScanErrorKind {
    UnexpectedEndOfInput,
    UnknownKeyword(String),
    UnknownMetaCommand(String),
    ParseIntError(std::num::ParseIntError),
}

impl std::fmt::Display for ScanErrorKind {
@@ -77,6 +92,7 @@ impl std::fmt::Display for ScanErrorKind {
            ScanErrorKind::UnexpectedEndOfInput => write!(f, "unexpected end of input"),
            ScanErrorKind::UnknownKeyword(x) => write!(f, "unknown keyword: {x:?}"),
            ScanErrorKind::UnknownMetaCommand(x) => write!(f, "unknown meta-command: {x:?}"),
            ScanErrorKind::ParseIntError(x) => write!(f, "failed to parse integer: {x}"),
        }
    }
}
@@ -207,6 +223,40 @@ impl Tokenizer {
        c.is_alphanumeric() || c == '_'
    }

    fn digit(c: char) -> bool {
        c.is_ascii_digit() || c == '-' || c == '+'
    }

    fn scan_integer(&mut self) -> Result<Token, ScanError> {
        let start_offset = self.offset;
        let mut word = String::new();
        let mut length = 0;
        if let Some(c) = self.advance() {
            word.push(c);
            length += 1;
        }
        while let Some(c) = self.peek() {
            if Self::digit(c) {
                word.push(c);
                self.advance();
            } else {
                break;
            }
            length += 1;
        }
        match word.parse::<i64>() {
            Ok(int) => Ok(Token {
                location: Location::new(self.file.clone(), start_offset, length),
                data: TokenData::Int(int),
                lexeme: word,
            }),
            Err(e) => Err(ScanError {
                location: Location::new(self.file.clone(), start_offset, length),
                kind: ScanErrorKind::ParseIntError(e),
            }),
        }
    }

    fn scan_token(&mut self) -> Result<Option<Token>, ScanError> {
        loop {
            if let Some(c) = self.peek() {
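A note on the predicate above: since `digit` also accepts `+` and `-`, sign characters can appear anywhere in the collected word, and validation is deferred entirely to `word.parse::<i64>()`. As a standalone sketch (not part of the commit), this is how `parse::<i64>` treats the lexemes the tests below exercise — at most one leading sign is accepted, so doubled or bare signs fail with InvalidDigit, exactly as the error snapshots record:

// Standalone sketch, not from the commit: i64 parsing of the tokenizer's
// integer lexemes. `str::parse::<i64>` allows at most one leading sign.
fn main() {
    for lexeme in ["10", "-10", "0", "-0", "--0", "++0", "-", "+"] {
        match lexeme.parse::<i64>() {
            Ok(n) => println!("{lexeme:?} -> Ok({n})"),   // "10", "-10", "0", "-0"
            Err(e) => println!("{lexeme:?} -> Err({e})"), // "--0", "++0", "-", "+"
        }
    }
}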
@@ -214,6 +264,8 @@ impl Tokenizer {
                    return self.scan_identifier_or_keyword().map(Some);
                } else if c == '.' {
                    return self.scan_meta_command().map(Some);
                } else if Self::digit(c) {
                    return self.scan_integer().map(Some);
                } else if c.is_whitespace() {
                    self.advance();
                } else {
@@ -308,9 +360,21 @@ mod tests {

    #[test]
    fn test_tokenizer_errors() {
        let scanerrors = tokenize("salact +".to_string(), "src/statement.sql".to_string())
            .err()
            .unwrap();
        assert_debug_snapshot!(scanerrors);
        assert_debug_snapshot!(tokenize(
            "salact +".to_string(),
            "src/statement.sql".to_string()
        ));
    }

    #[test]
    fn test_tokenizer_integers() {
        assert_debug_snapshot!(tokenize("10".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("-10".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("0".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("-0".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("--0".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("++0".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("-".to_string(), "src/ints.sql".to_string(),));
        assert_debug_snapshot!(tokenize("+".to_string(), "src/ints.sql".to_string(),));
    }
}
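insta gives each additional `assert_debug_snapshot!` in a test a numeric suffix, so the eight assertions in `test_tokenizer_integers` account for the eight new files above: `tokenizer_integers.snap` plus `tokenizer_integers-2.snap` through `tokenizer_integers-8.snap`. Pending snapshots can then be accepted or rejected interactively with `cargo insta review`.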