spring break work

This commit is contained in:
voidNUL 2024-03-16 18:27:14 -05:00
parent 5cef0b4563
commit dd7995ac31
7 changed files with 385 additions and 1165 deletions

1073
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -7,7 +7,5 @@ edition = "2021"
[dependencies] [dependencies]
html_parser = "0.7" html_parser = "0.7"
minify-html = "0.15"
grass = "0.13"
toml = "0.8" toml = "0.8"
walkdir = "2.5" walkdir = "2.5"

View file

@ -1,12 +1,13 @@
<!doctype html>
<html lang="en"> <html lang="en">
<head> <head>
<link> <link "asdf">
<lg:include rel='css' href="./style.css" /> <lg:include rel='css' href="./style.css" />
</head> </head>
<body> <body>
<A href="fdfs"> asdf </A> <A href="fdfs"> asdf </A>
</body> </body>
</html> </html>

View file

@ -67,7 +67,7 @@ fn run_compiler() -> Result<()> {
fn main() { fn main() {
let test = include_str!("../simple.html").to_string(); let test = include_str!("../simple.html").to_string();
// let test = " <as> </as> ".to_string(); // let test = " <as> </as> ".to_string();
let r = parser::parse_html(&test); let r = parser::parse_html(&test).unwrap();
for l in r { for l in r {
println!("{l:?}"); println!("{l:?}");
} }

View file

@ -1,8 +1,48 @@
use std::collections::HashMap; use std::collections::HashMap;
use crate::trace::*;
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
/// Failures that can occur while lexing and validating HTML input.
///
/// Converted into a `trace::Error` (via `From`) before being reported.
pub enum ParseError {
/// A tag was started but its name or closing `>` could not be parsed.
InvalidTag,
/// A closing tag was found whose name differs from the most recently
/// opened, still-unclosed tag.
MismatchedClosing { expected: String, found: String },
/// The named tag was opened but the input ended before it was closed.
UnmatchedOpen(String),
/// The named closing tag was found while no tag was open.
UnmatchedClose(String),
/// A closing tag was found for the named void element, which must not
/// have one.
VoidClosingTag(String),
/// The parser simply did not match the input; carries no detail and is
/// replaced by a more specific error when one is available.
Unknown,
}
use crate::trace::{self, WithContext};
impl From<ParseError> for trace::Error {
fn from(value: ParseError) -> Self {
let msg = match value {
ParseError::InvalidTag => "Failed to parse a tag".into(),
ParseError::MismatchedClosing { expected, found } => {
format!(
"Found closing tag '{}' where '{}' was expected",
found, expected
)
},
ParseError::UnmatchedOpen(s) => {
format!("The tag '{}' is opened, but never closed", s)
},
ParseError::UnmatchedClose(s) => {
format!("The tag '{}' is closed, but never opened", s)
},
ParseError::VoidClosingTag(s) => {
format!("The tag '{}' should not have a closing tag", s)
},
ParseError::Unknown => {
return trace::Error::new(
trace::ErrorKind::Unknown,
"Unknown error while parsing",
)
},
};
trace::Error::new(trace::ErrorKind::Parsing, msg)
}
}
#[derive(Clone, Debug, PartialEq)]
pub enum Lexeme<'a> { pub enum Lexeme<'a> {
OpenTag { OpenTag {
name: &'a str, name: &'a str,
@ -12,33 +52,82 @@ pub enum Lexeme<'a> {
CloseTag { CloseTag {
name: &'a str, name: &'a str,
}, },
Content(&'a str), Text(&'a str),
Doctype,
Comment,
} }
fn normalize_whitespace(s: &str) { fn normalize_whitespace(mut tail: &str) -> String {
// https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace // https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace
todo!() let mut _index = 0;
} let mut buffer = String::with_capacity(tail.len());
while !tail.is_empty() {
fn error(message: impl Into<String>) -> Error { match parse_whitespace_min(tail, 1, &mut _index) {
Error { Some((_, new_tail)) => {
kind: ErrorKind::Parsing, buffer.push(' ');
reason: message.into(), tail = new_tail;
backtrace: vec![], },
None => {},
}
let (chars, new_tail) =
parse_while(tail, |c| !c.is_whitespace(), &mut _index);
buffer.push_str(chars);
tail = new_tail
} }
buffer
} }
/// Try parsing single specific character /// Try parsing single specific character ignoring case
fn parse_char(i: &str, c: char) -> Option<(&str, &str)> { fn parse_char<'a>(
if i.starts_with(c) { tail: &'a str,
Some((&i[0..1], &i[1..])) c: char,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
if !tail.is_empty() && tail[0..1].eq_ignore_ascii_case(&c.to_string()) {
*index += 1;
Some((&tail[0..1], &tail[1..]))
} else { } else {
None None
} }
} }
// Parse until condition is not true for next character fn parse_str<'a>(
fn parse_while(tail: &str, condition: impl Fn(char) -> bool) -> (&str, &str) { tail: &'a str,
to_match: &'a str,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
if tail.len() < to_match.len() {
return None;
}
if tail[0..to_match.len()].eq_ignore_ascii_case(to_match) {
*index += to_match.len();
Some((&tail[0..to_match.len()], &tail[to_match.len()..]))
} else {
None
}
}
/// Consume input up to and including the first occurrence of `to_match`.
///
/// Returns `Some((consumed, rest))`, where `consumed` is the prefix of `tail`
/// ending with `to_match` and `rest` is everything after it, and advances
/// `index` by the number of bytes consumed. Returns `None`, leaving `index`
/// untouched, when `to_match` does not occur in `tail`.
fn parse_until_str<'a>(
    tail: &'a str,
    to_match: &'a str,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    // `str::find` returns a byte offset on a char boundary, so the split
    // below cannot panic on multi-byte input, and a terminator sitting at the
    // very end of `tail` is still found (a prefix scan over `0..tail.len()`
    // would never test the full-length prefix).
    let end = tail.find(to_match)? + to_match.len();
    *index += end;
    Some(tail.split_at(end))
}
/// Parse until condition is not true for next character
fn parse_while<'a>(
tail: &'a str,
condition: impl Fn(char) -> bool,
index: &mut usize,
) -> (&'a str, &'a str) {
let mut end; let mut end;
let mut it = tail.char_indices(); let mut it = tail.char_indices();
'outer: loop { 'outer: loop {
@ -55,105 +144,166 @@ fn parse_while(tail: &str, condition: impl Fn(char) -> bool) -> (&str, &str) {
}, },
}; };
} }
*index += end;
(&tail[0..end], &tail[end..]) (&tail[0..end], &tail[end..])
} }
fn parse_whitespace(i: &str) -> (&str, &str) { fn parse_whitespace<'a>(i: &'a str, index: &mut usize) -> (&'a str, &'a str) {
parse_while(i, |c| c.is_whitespace()) parse_while(i, |c| c.is_whitespace(), index)
} }
fn parse_doctype(tail: &str) -> Option<(&str, &str)> { fn parse_whitespace_min<'a>(
const doctype_str = "<!DOCTYPE>" tail: &'a str,
min: usize,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
let mut new_index = 0;
let (ws, tail) = parse_whitespace(tail, &mut new_index);
if ws.len() < min {
None
} else {
*index += new_index;
Some((ws, tail))
}
} }
/// Try parsing all characters between two delimiter /// Try parsing all characters between two delimiter
/// characters /// characters
fn parse_delimited(i: &str, delimiter: char) -> Option<(&str, &str)> { fn parse_delimited<'a>(
let (_, tail) = parse_char(i, delimiter)?; i: &'a str,
let (value, tail) = parse_while(tail, |c| c != delimiter); delimiter: char,
let (_, tail) = parse_char(tail, delimiter)?; index: &mut usize,
) -> Option<(&'a str, &'a str)> {
let mut new_index = 0;
let (_, tail) = parse_char(i, delimiter, &mut new_index)?;
let (value, tail) = parse_while(tail, |c| c != delimiter, &mut new_index);
let (_, tail) = parse_char(tail, delimiter, &mut new_index)?;
*index += new_index;
Some((value, tail)) Some((value, tail))
} }
fn parse_tag_name(i: &str) -> Option<(&str, &str)> { fn parse_tag_name<'a>(
let (value, tail) = parse_while(i, |c| c.is_ascii_alphanumeric() || c == ':'); i: &'a str,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
let mut new_index = 0;
let (value, tail) = parse_while(
i,
|c| c.is_ascii_alphanumeric() || [':', '_', '-'].contains(&c),
&mut new_index,
);
if value.is_empty() { if value.is_empty() {
None None
} else { } else {
*index += new_index;
Some((value, tail)) Some((value, tail))
} }
} }
fn parse_attribute_key(i: &str) -> Option<(&str, &str)> { fn parse_attribute_key<'a>(
let (value, tail) = parse_while(i, |c| { i: &'a str,
!(['"', '\'', '>', '/', '='].contains(&c) || c.is_control()) index: &mut usize,
}); ) -> Option<(&'a str, &'a str)> {
let mut new_index = 0;
let (value, tail) = parse_while(
i,
|c| {
!(['"', '\'', '>', '/', '='].contains(&c)
|| c.is_control()
|| c.is_whitespace())
},
&mut new_index,
);
if value.is_empty() { if value.is_empty() {
None None
} else { } else {
*index += new_index;
Some((value, tail)) Some((value, tail))
} }
} }
fn parse_attribute_val(i: &str) -> Option<(&str, &str)> { fn parse_attribute_val<'a>(
i: &'a str,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
const SINGLE_QUOTE: char = '\''; const SINGLE_QUOTE: char = '\'';
const DOUBLE_QUOTE: char = '"'; const DOUBLE_QUOTE: char = '"';
let (value, tail) = parse_delimited(i, '\'') // Single quote delimit let mut new_index = 0;
.or_else(|| parse_delimited(i, '"')) // Double quote delimit let (value, tail) =
parse_delimited(i, SINGLE_QUOTE, &mut new_index) // Single quote delimit
.or_else(|| parse_delimited(i, DOUBLE_QUOTE, &mut new_index)) // Double quote delimit
.or_else(|| { // Unquoted .or_else(|| { // Unquoted
Some(parse_while(i, |c| { Some(parse_while(i, |c| {
!(c.is_whitespace() !(c.is_whitespace()
|| [SINGLE_QUOTE, DOUBLE_QUOTE, '=', '<', '>', '`'].contains(&c)) || [SINGLE_QUOTE, DOUBLE_QUOTE, '=', '<', '>', '`'].contains(&c))
})) }, &mut new_index))
})?; })?;
if value.is_empty() { *index += new_index;
None Some((value, tail))
} else {
Some((value, tail))
}
} }
/// Returns Option<((key, value), tail)> /// Returns Option<((key, value), tail)>
fn parse_key_val(tail: &str) -> Option<((&str, Option<&str>), &str)> { fn parse_key_val<'a>(
tail: &'a str,
index: &mut usize,
) -> Option<((&'a str, Option<&'a str>), &'a str)> {
let mut new_index = 0;
// Require whitespace // Require whitespace
let (ws, tail) = parse_whitespace(tail); let (_, tail) = parse_whitespace_min(tail, 1, &mut new_index)?;
if ws.is_empty() {
return None;
}
// Fail when no key found // Fail when no key found
let (key, tail) = parse_attribute_key(tail)?; let (key, tail) = parse_attribute_key(tail, &mut new_index)?;
let (_, tail) = parse_whitespace(tail); if let Some((_, tail)) = parse_char(
if let Some((_, tail)) = parse_char(tail, '=') { parse_whitespace(tail, &mut new_index).1,
let (_, tail) = parse_whitespace(tail); '=',
&mut new_index,
) {
let (_, tail) = parse_whitespace(tail, &mut new_index);
// Fail when = is not followed by value // Fail when = is not followed by value
let (val, tail) = parse_attribute_val(tail)?; let (val, tail) = parse_attribute_val(tail, &mut new_index)?;
Some(((key, Some(val)), tail)) let val = if val.is_empty() { None } else { Some(val) };
*index += new_index;
Some(((key, val), tail))
} else { } else {
*index += new_index;
Some(((key, None), tail)) Some(((key, None), tail))
} }
} }
// Tags that are implicitly self closing, ending in /> is optional
const VOID_ELEMENTS: [&str; 16] = [ const VOID_ELEMENTS: [&str; 16] = [
"area", "base", "br", "col", "command", "embed", "hr", "img", "input", "area", "base", "br", "col", "command", "embed", "hr", "img", "input",
"keygen", "link", "meta", "param", "source", "track", "wbr", "keygen", "link", "meta", "param", "source", "track", "wbr",
]; ];
fn parse_open_tag(tail: &str) -> Option<(Lexeme, &str)> { fn parse_open_tag<'a>(
tail: &'a str,
index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
let mut new_index = 0;
// < // <
let (_, tail) = parse_char(tail, '<')?; let (_, tail) =
parse_char(tail, '<', &mut new_index).ok_or(ParseError::Unknown)?;
// tag name // tag name
let (name, mut tail) = parse_tag_name(tail)?; let (name, mut tail) =
parse_tag_name(tail, &mut new_index).ok_or(ParseError::InvalidTag)?;
// attributes // attributes
let mut attributes: HashMap<&str, Option<&str>> = HashMap::new(); let mut attributes: HashMap<&str, Option<&str>> = HashMap::new();
while let Some((kv, new_tail)) = parse_key_val(tail) { while let Some((kv, new_tail)) = parse_key_val(tail, &mut new_index) {
attributes.insert(kv.0, kv.1); attributes.insert(kv.0, kv.1);
tail = new_tail; tail = new_tail;
} }
let (_, tail) = parse_whitespace(tail); let (_, tail) = parse_whitespace(tail, &mut new_index);
let (is_void, tail) = parse_char(tail, '/').unwrap_or(("", tail)); let (is_void, tail) =
parse_char(tail, '/', &mut new_index).unwrap_or(("", tail));
let is_void = !is_void.is_empty() || VOID_ELEMENTS.contains(&name); let is_void = !is_void.is_empty() || VOID_ELEMENTS.contains(&name);
let (_, tail) = parse_char(tail, '>')?; let (_, tail) = match parse_char(tail, '>', &mut new_index) {
Some(( Some(v) => v,
None => {
return Err(ParseError::InvalidTag);
},
};
*index += new_index;
Ok((
Lexeme::OpenTag { Lexeme::OpenTag {
name, name,
attributes, attributes,
@ -163,37 +313,162 @@ fn parse_open_tag(tail: &str) -> Option<(Lexeme, &str)> {
)) ))
} }
fn parse_close_tag(tail: &str) -> Option<(Lexeme, &str)> { fn parse_close_tag<'a>(
let (_, tail) = parse_char(tail, '<')?; tail: &'a str,
let (_, tail) = parse_char(tail, '/')?; index: &mut usize,
let (name, tail) = parse_tag_name(tail)?; ) -> Result<(Lexeme<'a>, &'a str), ParseError> {
let (_, tail) = parse_whitespace(tail); let mut new_index = 0;
let (_, tail) = parse_char(tail, '>')?; let (_, tail) =
Some((Lexeme::CloseTag { name }, tail)) parse_char(tail, '<', &mut new_index).ok_or(ParseError::Unknown)?;
let (_, tail) =
parse_char(tail, '/', &mut new_index).ok_or(ParseError::Unknown)?;
let (name, tail) =
parse_tag_name(tail, &mut new_index).ok_or(ParseError::InvalidTag)?;
let (_, tail) = parse_whitespace(tail, &mut new_index);
let (_, tail) =
parse_char(tail, '>', &mut new_index).ok_or(ParseError::InvalidTag)?;
*index += new_index;
Ok((Lexeme::CloseTag { name }, tail))
} }
fn parse_text(tail: &str) -> Option<(Lexeme, &str)> { fn parse_doctype<'a>(
let (txt, tail) = parse_while(tail, |c| c != '<'); tail: &'a str,
index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
let mut new_index = 0;
let mut closure = || -> Option<(&str, &str)> {
let (_, tail) = parse_str(tail, "<!doctype", &mut new_index)?;
let (_, tail) = parse_whitespace_min(tail, 1, &mut new_index)?;
let (_, tail) = parse_str(tail, "html", &mut new_index)?;
let (_, tail) = parse_whitespace(tail, &mut new_index);
parse_char(tail, '>', &mut new_index)
};
let (_, tail) = closure().ok_or(ParseError::Unknown)?;
*index += new_index;
Ok((Lexeme::Doctype, tail))
}
/// Lex an HTML comment (`<!-- ... -->`) from the start of `tail`.
///
/// On success returns `Lexeme::Comment` (the comment text is not retained)
/// together with the remaining input, and advances `index` by the number of
/// bytes consumed. Fails with `ParseError::Unknown`, leaving `index`
/// untouched, when the input does not start with `<!--` or the closing `-->`
/// is not found.
fn parse_comment<'a>(
    tail: &'a str,
    index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
    // Progress is tracked locally so `index` only moves once the whole
    // comment has been recognised.
    let mut consumed = 0;
    let rest = parse_str(tail, "<!--", &mut consumed)
        .and_then(|(_, body)| parse_until_str(body, "-->", &mut consumed))
        .map(|(_, rest)| rest)
        .ok_or(ParseError::Unknown)?;
    *index += consumed;
    Ok((Lexeme::Comment, rest))
}
fn parse_text<'a>(
tail: &'a str,
index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
let mut new_index = 0;
let (txt, tail) = parse_while(tail, |c| c != '<', &mut new_index);
if txt.is_empty() { if txt.is_empty() {
None Err(ParseError::Unknown)
} else { } else {
Some((Lexeme::Content(txt), tail)) *index += new_index;
Ok((Lexeme::Text(txt), tail))
} }
} }
pub fn parse_html(mut tail: &str) -> Vec<Lexeme> { fn or_keep_error<'a>(
let mut stack = vec![]; r: Result<(Lexeme<'a>, &'a str), ParseError>,
op: impl FnOnce() -> Result<(Lexeme<'a>, &'a str), ParseError>,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
match r {
Ok(val) => Ok(val),
Err(e) => op().map_err(|new_e| match e {
ParseError::Unknown => new_e,
_ => e,
}),
}
}
/// Convert a byte offset into `input` to a 1-based `(row, column)` pair.
///
/// Rows are counted by `'\n'` characters before `index`; the column is one
/// more than the number of characters between the last newline and `index`.
/// Panics if `index` is out of bounds or not on a character boundary.
fn index_to_rc(input: &str, index: usize) -> (usize, usize) {
    let prefix = &input[..index];
    // Each newline before the offset starts a new row.
    let row = 1 + prefix.matches('\n').count();
    // The current line begins just after the last newline (or at the start).
    let line_start = prefix.rfind('\n').map_or(0, |pos| pos + 1);
    let col = 1 + prefix[line_start..].chars().count();
    (row, col)
}
pub fn parse_html(input: &str) -> trace::Result<Vec<Lexeme>> {
let mut tail = input;
let mut lexeme_stack = vec![];
let mut validation_stack = vec![];
let mut index = 0;
let err = |error: ParseError, index: usize| -> trace::Result<Vec<Lexeme>> {
let (row, col) = index_to_rc(input, index);
let e: trace::Error = error.into();
Err(e).ctx(format!("Starting at line {} character {}", row, col))
};
while !tail.is_empty() { while !tail.is_empty() {
let (_, new_tail) = parse_whitespace(tail); let (_, new_tail) = parse_whitespace(tail, &mut index);
if new_tail.is_empty() { if new_tail.is_empty() {
break; break;
} }
let (lm, new_tail) = parse_open_tag(new_tail) let result = or_keep_error(parse_open_tag(new_tail, &mut index), || {
.or_else(|| parse_close_tag(new_tail)) parse_close_tag(new_tail, &mut index)
.or_else(|| parse_text(new_tail)) });
.unwrap(); let result = or_keep_error(result, || parse_text(new_tail, &mut index));
stack.push(lm); let result = or_keep_error(result, || parse_comment(new_tail, &mut index));
let (lm, new_tail) =
match or_keep_error(result, || parse_doctype(new_tail, &mut index)) {
Ok(v) => v,
Err(e) => {
return err(e, index);
},
};
// Validate that open and close tags match
match lm {
Lexeme::OpenTag { name, is_void, .. } => {
if !is_void {
validation_stack.push(name);
}
},
Lexeme::CloseTag { name } => {
if VOID_ELEMENTS.contains(&name) {
return err(ParseError::VoidClosingTag(name.into()).into(), index);
}
if let Some(top) = validation_stack.pop() {
if name != top {
return err(
ParseError::MismatchedClosing {
expected: top.into(),
found: name.into(),
},
index,
);
}
} else {
return err(ParseError::UnmatchedClose(name.into()), index);
}
},
Lexeme::Comment => {
tail = new_tail;
continue;
},
_ => {},
};
lexeme_stack.push(lm);
tail = new_tail; tail = new_tail;
} }
stack if let Some(top) = validation_stack.pop() {
let e: trace::Error = ParseError::UnmatchedOpen(top.into()).into();
return Err(e).ctx("At end of file");
}
Ok(lexeme_stack)
} }

View file

@ -44,15 +44,26 @@ pub enum ErrorKind {
Unknown, Unknown,
} }
impl Display for Error { impl Display for ErrorKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!( write!(
f, f,
"{:?} error\nReason:\n\t{}\nBacktrace:\n", "{}",
self.kind, self.reason match self {
)?; ErrorKind::IO => "IO",
ErrorKind::Parsing => "PARSING",
ErrorKind::Compilation => "COMPILATION",
ErrorKind::Unknown => "UNKNOWN",
}
)
}
}
impl Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "\n[{} ERROR] {}\nBacktrace:\n", self.kind, self.reason)?;
for s in self.backtrace.iter().rev() { for s in self.backtrace.iter().rev() {
write!(f, "\t{}\n", s)?; write!(f, "{}\n", s)?;
} }
Ok(()) Ok(())
} }
@ -129,9 +140,10 @@ where
S: Into<String>, S: Into<String>,
{ {
fn ctx(self, s: S) -> Result<T> { fn ctx(self, s: S) -> Result<T> {
match self { self.ok_or_else(|| Error {
Some(v) => Ok(v), kind: ErrorKind::Unknown,
None => Err(Error::new(ErrorKind::Unknown, "Missing expected value")), reason: "Missing expected value".into(),
} backtrace: vec![s.into()],
})
} }
} }

View file

@ -7,8 +7,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="test_files/main-ad0a5132b4027392.css"> <link rel="stylesheet" href="test_files/main-ad0a5132b4027392.css">
<link rel="preload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01_bg.wasm" as="fetch" type="application/wasm" <link rel="preload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01_bg.wasm" as="fetch" type="application/wasm" crossorigin="">
crossorigin="">
<link rel="modulepreload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01.js"> <link rel="modulepreload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01.js">
<link <link
href="data:text/css,%3Ais(%5Bid*%3D'google_ads_iframe'%5D%2C%5Bid*%3D'taboola-'%5D%2C.taboolaHeight%2C.taboola-placeholder%2C%23credential_picker_container%2C%23credentials-picker-container%2C%23credential_picker_iframe%2C%5Bid*%3D'google-one-tap-iframe'%5D%2C%23google-one-tap-popup-container%2C.google-one-tap-modal-div%2C%23amp_floatingAdDiv%2C%23ez-content-blocker-container)%20%7Bdisplay%3Anone!important%3Bmin-height%3A0!important%3Bheight%3A0!important%3B%7D" href="data:text/css,%3Ais(%5Bid*%3D'google_ads_iframe'%5D%2C%5Bid*%3D'taboola-'%5D%2C.taboolaHeight%2C.taboola-placeholder%2C%23credential_picker_container%2C%23credentials-picker-container%2C%23credential_picker_iframe%2C%5Bid*%3D'google-one-tap-iframe'%5D%2C%23google-one-tap-popup-container%2C.google-one-tap-modal-div%2C%23amp_floatingAdDiv%2C%23ez-content-blocker-container)%20%7Bdisplay%3Anone!important%3Bmin-height%3A0!important%3Bheight%3A0!important%3B%7D"