//! Lexer for the toy language, consolidated from patch
//! b55135245d11d7a67080f0a69e08a56edb51349e ("tokens").
//!
//! The original patch also created `.gitignore` (`/target`), `Cargo.lock`,
//! and `Cargo.toml` (package `lang` 0.1.0, edition 2021); those files carry
//! no logic and are unchanged. The Rust below merges `src/token.rs` and
//! `src/main.rs`, with these fixes:
//!   * `<`/`<=` lexed as `Greater`/`GreaterEqual` and `>`/`>=` as
//!     `Less`/`LessEqual` — the comparison token types were swapped.
//!   * Generic arguments lost in transit restored (`Vec<Token>`,
//!     `parse::<f64>()`).
//!   * A word ending exactly at end-of-input was truncated to one byte,
//!     because `end` only advanced when a following non-word char was seen.
//!   * Token spans used `start + 1` for the first char; `len_utf8()` keeps
//!     spans valid for multi-byte characters.
//!   * The REPL spun forever at EOF (`read_line` returning 0 bytes).
//!   * Dead `let mut advance = || {};` removed.

// ==== src/token.rs ====

/// Kind of a lexed token. `Number` carries the parsed value; every other
/// variant is identified by its source span alone.
#[derive(Debug, Clone)]
pub enum TokenType {
    // Symbols
    LeftParen,
    RightParen,
    LeftSquare,
    RightSquare,
    LeftBrace,
    RightBrace,
    Comma,
    Dot,
    Plus,
    Minus,
    Star,
    Slash,
    Semicolon,

    Bang,
    BangEqual,
    Equal,
    DoubleEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,

    // Literals
    String,
    // NOTE(review): reserved — the scanner has no char-literal branch yet.
    Character,
    Number(f64),

    // Words
    Ident,
    And,
    Or,
    Self_,
    Struct,
    True,
    False,
    Fn,
    If,
    Else,
    Nil,
    Print,
    Return,
    Super,
    Let,
    While,
    For,

    // Special
    Unrecognized,
    // NOTE(review): reserved — never produced by the current scanner.
    TooLong,
}

/// A token: its type plus the byte span `start..end` of the lexeme in the
/// input string.
#[derive(Debug, Clone)]
pub struct Token {
    pub ttype: TokenType,
    pub start: usize,
    pub end: usize,
}

/// Splits `input` into tokens.
///
/// The scanner never fails: characters it does not understand become
/// `TokenType::Unrecognized` tokens. Each returned `Token` records the byte
/// span of its lexeme inside `input`, so callers can slice the original
/// string (as the REPL does).
pub fn tokenize(input: &str) -> Vec<Token> {
    let input_str = input;
    let mut input = input.char_indices().peekable();
    let mut tokens = vec![];
    'outer: loop {
        // Skip whitespace; stop cleanly at end of input.
        let (start, c) = 'ws: loop {
            match input.next() {
                None => break 'outer,
                Some((index, character)) if !character.is_whitespace() => {
                    break 'ws (index, character)
                },
                _ => {},
            }
        };
        // len_utf8 (not `+ 1`) keeps the span valid for multi-byte chars.
        let mut end = start + c.len_utf8();
        let ttype = match c {
            // Match single character tokens
            '(' => TokenType::LeftParen,
            ')' => TokenType::RightParen,
            '[' => TokenType::LeftSquare,
            ']' => TokenType::RightSquare,
            '{' => TokenType::LeftBrace,
            '}' => TokenType::RightBrace,
            ',' => TokenType::Comma,
            '.' => TokenType::Dot,
            '+' => TokenType::Plus,
            '-' => TokenType::Minus,
            '*' => TokenType::Star,
            '/' => TokenType::Slash,
            ';' => TokenType::Semicolon,
            // Match two-character operators: peek for a trailing '='.
            '!' => match input.peek() {
                Some((_, '=')) => {
                    input.next();
                    end += 1;
                    TokenType::BangEqual
                },
                _ => TokenType::Bang,
            },
            '=' => match input.peek() {
                Some((_, '=')) => {
                    input.next();
                    end += 1;
                    TokenType::DoubleEqual
                },
                _ => TokenType::Equal,
            },
            // Fixed: '<' is Less/LessEqual (was swapped with '>').
            '<' => match input.peek() {
                Some((_, '=')) => {
                    input.next();
                    end += 1;
                    TokenType::LessEqual
                },
                _ => TokenType::Less,
            },
            '>' => match input.peek() {
                Some((_, '=')) => {
                    input.next();
                    end += 1;
                    TokenType::GreaterEqual
                },
                _ => TokenType::Greater,
            },
            // Match keywords, identifiers, and numeric literals.
            c if c.is_alphanumeric() => 'case: {
                // Consume the full word, advancing `end` as each char is
                // taken so the span is correct even when the word ends the
                // input (the original only updated `end` on the break path).
                while let Some(&(idx, next)) = input.peek() {
                    if next.is_alphanumeric() || next == '_' {
                        input.next();
                        end = idx + next.len_utf8();
                    } else {
                        break;
                    }
                }
                let word = &input_str[start..end];
                // Attempt to parse hex literal
                if let Some(s) =
                    word.strip_prefix("0x").or_else(|| word.strip_prefix("0X"))
                {
                    break 'case match u64::from_str_radix(s, 16) {
                        Ok(n) => TokenType::Number(n as f64),
                        Err(_) => TokenType::Unrecognized,
                    };
                }
                // Attempt to parse binary literal
                if let Some(s) =
                    word.strip_prefix("0b").or_else(|| word.strip_prefix("0B"))
                {
                    break 'case match u64::from_str_radix(s, 2) {
                        Ok(n) => TokenType::Number(n as f64),
                        Err(_) => TokenType::Unrecognized,
                    };
                }
                // Attempt to parse decimal literal
                if let Ok(f) = word.parse::<f64>() {
                    break 'case TokenType::Number(f);
                }
                // Parse keyword or ident
                match word {
                    "and" => TokenType::And,
                    "or" => TokenType::Or,
                    "self" => TokenType::Self_,
                    "struct" => TokenType::Struct,
                    "true" => TokenType::True,
                    "false" => TokenType::False,
                    "fn" => TokenType::Fn,
                    "if" => TokenType::If,
                    "else" => TokenType::Else,
                    "nil" => TokenType::Nil,
                    "print" => TokenType::Print,
                    "return" => TokenType::Return,
                    "super" => TokenType::Super,
                    "let" => TokenType::Let,
                    "while" => TokenType::While,
                    "for" => TokenType::For,
                    _ => TokenType::Ident,
                }
            },
            // Parse string literal: scan to the closing quote.
            '"' => {
                while let Some((new_end, next)) = input.next() {
                    match next {
                        '"' => {
                            end = new_end + 1;
                            break;
                        },
                        // Skip escapes and deal with them later
                        '\\' => {
                            let _ = input.next();
                        },
                        _ => {},
                    }
                }
                // NOTE(review): an unterminated string leaves `end` at the
                // opening quote; decide on an error token when escapes land.
                TokenType::String
            },
            _ => TokenType::Unrecognized,
        };
        tokens.push(Token { ttype, start, end });
    }
    tokens
}

// ==== src/main.rs ====
// (`mod token;` / `token::` prefixes dropped here because both files are
// consolidated into one listing.)

fn main() {
    repl();
}

/// Read-eval-print loop: reads stdin one line at a time and prints each
/// token's lexeme alongside its type.
pub fn repl() {
    let mut buffer = String::new();
    let stdin = std::io::stdin();
    loop {
        // `read_line` returns the byte count; 0 means EOF. The original
        // ignored this and busy-looped forever once stdin closed.
        if stdin.read_line(&mut buffer).unwrap() == 0 {
            break;
        }
        for tok in tokenize(&buffer) {
            println!("{} : {:?}", &buffer[tok.start..tok.end], tok.ttype);
        }
        // Reuse the buffer's allocation instead of replacing the String.
        buffer.clear();
    }
}