tokens

2024-07-15 03:22:56 -05:00 · 2024-07-15 03:22:56 -05:00 · b55135245d
commit b55135245d
5 changed files with 242 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 /target
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,7 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "lang"
 version = "0.1.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,6 @@
 [package]
 name = "lang"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,18 @@
 mod token;
 fn main() {
  repl();
 }
 pub fn repl() {
  let mut buffer = String::new();
  let stdin = std::io::stdin();
  loop {
    stdin.read_line(&mut buffer).unwrap();
    let tokens = token::tokenize(&buffer);
    for tok in tokens {
      println!("{} : {:?}", &buffer[tok.start..tok.end], tok.ttype);
    }
    buffer = String::new();
  }
 }
--- a/src/token.rs
+++ b/src/token.rs
@ -0,0 +1,210 @@
 #[derive(Debug, Clone)]
 pub enum TokenType {
  // Symbols
  LeftParen,
  RightParen,
  LeftSquare,
  RightSquare,
  LeftBrace,
  RightBrace,
  Comma,
  Dot,
  Plus,
  Minus,
  Star,
  Slash,
  Semicolon,
  Bang,
  BangEqual,
  Equal,
  DoubleEqual,
  Greater,
  GreaterEqual,
  Less,
  LessEqual,
  // Literals
  String,
  Character,
  Number(f64),
  // Words
  Ident,
  And,
  Or,
  Self_,
  Struct,
  True,
  False,
  Fn,
  If,
  Else,
  Nil,
  Print,
  Return,
  Super,
  Let,
  While,
  For,
  // Special
  Unrecognized,
  TooLong,
 }
 /// Type, index
 #[derive(Debug, Clone)]
 pub struct Token {
  pub ttype: TokenType,
  pub start: usize,
  pub end: usize,
 }
 pub fn tokenize(input: &str) -> Vec<Token> {
  let input_str = input;
  let mut input = input.char_indices().peekable();
  let mut tokens = vec![];
  'outer: loop {
    // Find next non-whitespace line
    let (start, c) = 'ws: loop {
      match input.next() {
        // Stop at end of input
        None => break 'outer,
        Some((index, character)) if !character.is_whitespace() => {
          break 'ws (index, character)
        },
        _ => {},
      }
    };
    let mut end = start + 1;
    let mut advance = || {};
    let ttype = match c {
      // Match single character tokens
      '(' => TokenType::LeftParen,
      ')' => TokenType::RightParen,
      '[' => TokenType::LeftSquare,
      ']' => TokenType::RightSquare,
      '{' => TokenType::LeftBrace,
      '}' => TokenType::RightBrace,
      ',' => TokenType::Comma,
      '.' => TokenType::Dot,
      '+' => TokenType::Plus,
      '-' => TokenType::Minus,
      '*' => TokenType::Star,
      '/' => TokenType::Slash,
      ';' => TokenType::Semicolon,
      // Match multicharacter tokens
      '!' => match input.peek() {
        Some((_, '=')) => {
          input.next();
          end += 1;
          TokenType::BangEqual
        },
        _ => TokenType::Bang,
      },
      '=' => match input.peek() {
        Some((_, '=')) => {
          input.next();
          end += 1;
          TokenType::DoubleEqual
        },
        _ => TokenType::Equal,
      },
      '<' => match input.peek() {
        Some((_, '=')) => {
          input.next();
          end += 1;
          TokenType::GreaterEqual
        },
        _ => TokenType::Greater,
      },
      '>' => match input.peek() {
        Some((_, '=')) => {
          input.next();
          end += 1;
          TokenType::LessEqual
        },
        _ => TokenType::Less,
      },
      // Match keywords, identifiers, and literals
      c if c.is_alphanumeric() => 'case: {
        // Scan full word
        while let Some((new_end, next)) = input.peek() {
          if next.is_alphanumeric() || *next == '_' {
            let _ = input.next();
          } else {
            end = *new_end;
            break;
          }
        }
        let word = &input_str[start..end];
        // Attempt to parse hex literal
        if let Some(s) =
          word.strip_prefix("0x").or_else(|| word.strip_prefix("0X"))
        {
          if let Ok(n) = u64::from_str_radix(s, 16) {
            break 'case TokenType::Number(n as f64);
          } else {
            break 'case TokenType::Unrecognized;
          }
        }
        // Attempt to parse binary literal
        if let Some(s) =
          word.strip_prefix("0b").or_else(|| word.strip_prefix("0B"))
        {
          if let Ok(n) = u64::from_str_radix(s, 2) {
            break 'case TokenType::Number(n as f64);
          } else {
            break 'case TokenType::Unrecognized;
          }
        }
        // Attempt to parse decimal literal
        if let Ok(f) = word.parse::<f64>() {
          break 'case TokenType::Number(f);
        }
        // Parse keyword or ident
        match word {
          "and" => TokenType::And,
          "or" => TokenType::Or,
          "self" => TokenType::Self_,
          "struct" => TokenType::Struct,
          "true" => TokenType::True,
          "false" => TokenType::False,
          "fn" => TokenType::Fn,
          "if" => TokenType::If,
          "else" => TokenType::Else,
          "nil" => TokenType::Nil,
          "print" => TokenType::Print,
          "return" => TokenType::Return,
          "super" => TokenType::Super,
          "let" => TokenType::Let,
          "while" => TokenType::While,
          "for" => TokenType::For,
          _ => TokenType::Ident,
        }
      },
      // Parse string
      '"' => {
        while let Some((new_end, next)) = input.next() {
          match next {
            '"' => {
              end = new_end + 1;
              break;
            },
            // Skip escapes and deal with them later
            '\\' => {
              let _ = input.next();
            },
            _ => {},
          }
        }
        TokenType::String
      },
      // Parse character
      _ => TokenType::Unrecognized,
    };
    tokens.push(Token { ttype, start, end });
  }
  tokens
 }